diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 4e8a1794f50a..c39fe782ba67 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -140,4 +140,6 @@ source "drivers/staging/netlogic/Kconfig" source "drivers/staging/dwc2/Kconfig" +source "drivers/staging/lustre/Kconfig" + endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 415772ea306d..110c59754dda 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -62,3 +62,4 @@ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/ obj-$(CONFIG_ZCACHE) += zcache/ obj-$(CONFIG_GOLDFISH) += goldfish/ obj-$(CONFIG_USB_DWC2) += dwc2/ +obj-$(CONFIG_LUSTRE_FS) += lustre/ diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig new file mode 100644 index 000000000000..a224d88bf43d --- /dev/null +++ b/drivers/staging/lustre/Kconfig @@ -0,0 +1,3 @@ +source "drivers/staging/lustre/lustre/Kconfig" + +source "drivers/staging/lustre/lnet/Kconfig" diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile new file mode 100644 index 000000000000..26162893fd20 --- /dev/null +++ b/drivers/staging/lustre/Makefile @@ -0,0 +1,4 @@ +subdir-ccflags-y := -I$(src)/include/ + +obj-$(CONFIG_LUSTRE_FS) += lustre/ +obj-$(CONFIG_LNET) += lnet/ diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO new file mode 100644 index 000000000000..22742d6d62a8 --- /dev/null +++ b/drivers/staging/lustre/TODO @@ -0,0 +1,13 @@ +* Possible remaining coding style fix. +* Remove deadcode. +* Seperate client/server functionality. Functions only used by server can be + removed from client. +* Clean up libcfs layer. Ideally we can remove include/linux/libcfs entirely. +* Clean up CLIO layer. Lustre client readahead/writeback control needs to better + suit kernel providings. +* Add documents in Documentation. +* Other minor misc cleanups... + +Please send any patches to Greg Kroah-Hartman , Andreas Dilger + and Peng Tao . CCing +hpdd-discuss would be great too. diff --git a/drivers/staging/lustre/include/linux/libcfs/bitmap.h b/drivers/staging/lustre/include/linux/libcfs/bitmap.h new file mode 100644 index 000000000000..3f1c37b4bb7a --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/bitmap.h @@ -0,0 +1,111 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LIBCFS_BITMAP_H_ +#define _LIBCFS_BITMAP_H_ + + +typedef struct { + int size; + unsigned long data[0]; +} cfs_bitmap_t; + +#define CFS_BITMAP_SIZE(nbits) \ + (((nbits/BITS_PER_LONG)+1)*sizeof(long)+sizeof(cfs_bitmap_t)) + +static inline +cfs_bitmap_t *CFS_ALLOCATE_BITMAP(int size) +{ + cfs_bitmap_t *ptr; + + OBD_ALLOC(ptr, CFS_BITMAP_SIZE(size)); + if (ptr == NULL) + RETURN(ptr); + + ptr->size = size; + + RETURN (ptr); +} + +#define CFS_FREE_BITMAP(ptr) OBD_FREE(ptr, CFS_BITMAP_SIZE(ptr->size)) + +static inline +void cfs_bitmap_set(cfs_bitmap_t *bitmap, int nbit) +{ + set_bit(nbit, bitmap->data); +} + +static inline +void cfs_bitmap_clear(cfs_bitmap_t *bitmap, int nbit) +{ + test_and_clear_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_check(cfs_bitmap_t *bitmap, int nbit) +{ + return test_bit(nbit, bitmap->data); +} + +static inline +int cfs_bitmap_test_and_clear(cfs_bitmap_t *bitmap, int nbit) +{ + return test_and_clear_bit(nbit, bitmap->data); +} + +/* return 0 is bitmap has none set bits */ +static inline +int cfs_bitmap_check_empty(cfs_bitmap_t *bitmap) +{ + return find_first_bit(bitmap->data, bitmap->size) == bitmap->size; +} + +static inline +void cfs_bitmap_copy(cfs_bitmap_t *new, cfs_bitmap_t *old) +{ + int newsize; + + LASSERT(new->size >= old->size); + newsize = new->size; + memcpy(new, old, CFS_BITMAP_SIZE(old->size)); + new->size = newsize; +} + +#define cfs_foreach_bit(bitmap, pos) \ + for ((pos) = find_first_bit((bitmap)->data, bitmap->size); \ + (pos) < (bitmap)->size; \ + (pos) = find_next_bit((bitmap)->data, (bitmap)->size, (pos) + 1)) + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/curproc.h b/drivers/staging/lustre/include/linux/libcfs/curproc.h new file mode 100644 index 000000000000..90d7ce630e94 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/curproc.h @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/curproc.h + * + * Lustre curproc API declaration + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_CURPROC_H__ +#define __LIBCFS_CURPROC_H__ + +/* + * Portable API to access common characteristics of "current" UNIX process. + * + * Implemented in portals/include/libcfs// + */ +int cfs_curproc_groups_nr(void); +int current_is_in_group(gid_t group); +void cfs_curproc_groups_dump(gid_t *array, int size); + +/* + * Plus, platform-specific constant + * + * CFS_CURPROC_COMM_MAX, + * + * and opaque scalar type + * + * kernel_cap_t + */ + +/* check if task is running in compat mode.*/ +int current_is_32bit(void); +#define current_pid() (current->pid) +#define current_comm() (current->comm) +int cfs_get_environ(const char *key, char *value, int *val_len); + +typedef __u32 cfs_cap_t; + +#define CFS_CAP_CHOWN 0 +#define CFS_CAP_DAC_OVERRIDE 1 +#define CFS_CAP_DAC_READ_SEARCH 2 +#define CFS_CAP_FOWNER 3 +#define CFS_CAP_FSETID 4 +#define CFS_CAP_LINUX_IMMUTABLE 9 +#define CFS_CAP_SYS_ADMIN 21 +#define CFS_CAP_SYS_BOOT 23 +#define CFS_CAP_SYS_RESOURCE 24 + +#define CFS_CAP_FS_MASK ((1 << CFS_CAP_CHOWN) | \ + (1 << CFS_CAP_DAC_OVERRIDE) | \ + (1 << CFS_CAP_DAC_READ_SEARCH) | \ + (1 << CFS_CAP_FOWNER) | \ + (1 << CFS_CAP_FSETID ) | \ + (1 << CFS_CAP_LINUX_IMMUTABLE) | \ + (1 << CFS_CAP_SYS_ADMIN) | \ + (1 << CFS_CAP_SYS_BOOT) | \ + (1 << CFS_CAP_SYS_RESOURCE)) + +void cfs_cap_raise(cfs_cap_t cap); +void cfs_cap_lower(cfs_cap_t cap); +int cfs_cap_raised(cfs_cap_t cap); +cfs_cap_t cfs_curproc_cap_pack(void); +void cfs_curproc_cap_unpack(cfs_cap_t cap); +int cfs_capable(cfs_cap_t cap); + +/* __LIBCFS_CURPROC_H__ */ +#endif +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h new file mode 100644 index 000000000000..6dd5a7d27827 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs.h @@ -0,0 +1,286 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LIBCFS_H__ +#define __LIBCFS_LIBCFS_H__ + +#if !__GNUC__ +#define __attribute__(x) +#endif + +#include + +#include "curproc.h" + +#ifndef offsetof +# define offsetof(typ,memb) ((long)(long_ptr_t)((char *)&(((typ *)0)->memb))) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) ((sizeof (a)) / (sizeof ((a)[0]))) +#endif + +#if !defined(swap) +#define swap(x,y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#if !defined(container_of) +/* given a pointer @ptr to the field @member embedded into type (usually + * struct) @type, return pointer to the embedding instance of @type. */ +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) +#endif + +static inline int __is_po2(unsigned long long val) +{ + return !(val & (val - 1)); +} + +#define IS_PO2(val) __is_po2((unsigned long long)(val)) + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +/* + * Lustre Error Checksum: calculates checksum + * of Hex number by XORing each bit. + */ +#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ + ((hexnum) >> 8 & 0xf)) + + +/* + * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses + * Lustre RETURN(NULL) macro. + */ +#if defined(NULL) +#undef NULL +#endif + +#define NULL ((void *)0) + +#define LUSTRE_SRV_LNET_PID LUSTRE_LNET_PID + + +#include + +#ifndef cfs_for_each_possible_cpu +# error cfs_for_each_possible_cpu is not supported by kernel! +#endif + +/* libcfs tcpip */ +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int libcfs_ipif_enumerate(char ***names); +void libcfs_ipif_free_enumeration(char **names, int n); +int libcfs_sock_listen(socket_t **sockp, __u32 ip, int port, int backlog); +int libcfs_sock_accept(socket_t **newsockp, socket_t *sock); +void libcfs_sock_abort_accept(socket_t *sock); +int libcfs_sock_connect(socket_t **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); +int libcfs_sock_setbuf(socket_t *socket, int txbufsize, int rxbufsize); +int libcfs_sock_getbuf(socket_t *socket, int *txbufsize, int *rxbufsize); +int libcfs_sock_getaddr(socket_t *socket, int remote, __u32 *ip, int *port); +int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout); +int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout); +void libcfs_sock_release(socket_t *sock); + +/* libcfs watchdogs */ +struct lc_watchdog; + +/* Add a watchdog which fires after "time" milliseconds of delay. You have to + * touch it once to enable it. */ +struct lc_watchdog *lc_watchdog_add(int time, + void (*cb)(pid_t pid, void *), + void *data); + +/* Enables a watchdog and resets its timer. */ +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout); +#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout, \ + AT_OFF ? 0 : at_get(&svc->srv_at_estimate)) * \ + svc->srv_watchdog_factor) + +/* Disable a watchdog; touch it to restart it. */ +void lc_watchdog_disable(struct lc_watchdog *lcw); + +/* Clean up the watchdog */ +void lc_watchdog_delete(struct lc_watchdog *lcw); + +/* Dump a debug log */ +void lc_watchdog_dumplog(pid_t pid, void *data); + + +/* need both kernel and user-land acceptor */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +/* + * libcfs pseudo device operations + * + * struct psdev_t and + * misc_register() and + * misc_deregister() are declared in + * libcfs//-prim.h + * + * It's just draft now. + */ + +struct cfs_psdev_file { + unsigned long off; + void *private_data; + unsigned long reserved1; + unsigned long reserved2; +}; + +struct cfs_psdev_ops { + int (*p_open)(unsigned long, void *); + int (*p_close)(unsigned long, void *); + int (*p_read)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_write)(struct cfs_psdev_file *, char *, unsigned long); + int (*p_ioctl)(struct cfs_psdev_file *, unsigned long, void *); +}; + +/* + * Drop into debugger, if possible. Implementation is provided by platform. + */ + +void cfs_enter_debugger(void); + +/* + * Defined by platform + */ +int unshare_fs_struct(void); +sigset_t cfs_get_blocked_sigs(void); +sigset_t cfs_block_allsigs(void); +sigset_t cfs_block_sigs(unsigned long sigs); +sigset_t cfs_block_sigsinv(unsigned long sigs); +void cfs_restore_sigs(sigset_t); +int cfs_signal_pending(void); +void cfs_clear_sigpending(void); + +int convert_server_error(__u64 ecode); +int convert_client_oflag(int cflag, int *result); + +/* + * Stack-tracing filling. + */ + +/* + * Platform-dependent data-type to hold stack frames. + */ +struct cfs_stack_trace; + +/* + * Fill @trace with current back-trace. + */ +void cfs_stack_trace_fill(struct cfs_stack_trace *trace); + +/* + * Return instruction pointer for frame @frame_no. NULL if @frame_no is + * invalid. + */ +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no); + +#ifndef O_NOACCESS +#define O_NOACCESS O_NONBLOCK +#endif + +/* + * Universal open flags. + */ +#define CFS_O_NOACCESS 0003 +#define CFS_O_ACCMODE CFS_O_NOACCESS +#define CFS_O_CREAT 0100 +#define CFS_O_EXCL 0200 +#define CFS_O_NOCTTY 0400 +#define CFS_O_TRUNC 01000 +#define CFS_O_APPEND 02000 +#define CFS_O_NONBLOCK 04000 +#define CFS_O_NDELAY CFS_O_NONBLOCK +#define CFS_O_SYNC 010000 +#define CFS_O_ASYNC 020000 +#define CFS_O_DIRECT 040000 +#define CFS_O_LARGEFILE 0100000 +#define CFS_O_DIRECTORY 0200000 +#define CFS_O_NOFOLLOW 0400000 +#define CFS_O_NOATIME 01000000 + +/* convert local open flags to universal open flags */ +int cfs_oflags2univ(int flags); +/* convert universal open flags to local open flags */ +int cfs_univ2oflags(int flags); + +/* + * Random number handling + */ + +/* returns a random 32-bit integer */ +unsigned int cfs_rand(void); +/* seed the generator */ +void cfs_srand(unsigned int, unsigned int); +void cfs_get_random_bytes(void *buf, int size); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* container_of depends on "likely" which is defined in libcfs_private.h */ +static inline void *__container_of(void *ptr, unsigned long shift) +{ + if (unlikely(IS_ERR(ptr) || ptr == NULL)) + return ptr; + else + return (char *)ptr - shift; +} + +#define container_of0(ptr, type, member) \ + ((type *)__container_of((void *)(ptr), offsetof(type, member))) + +#define SET_BUT_UNUSED(a) do { } while(sizeof(a) - sizeof(a)) + +#define _LIBCFS_H + +#endif /* _LIBCFS_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h new file mode 100644 index 000000000000..6ae7415a3b99 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h @@ -0,0 +1,214 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_cpu.h + * + * CPU partition + * . CPU partition is virtual processing unit + * + * . CPU partition can present 1-N cores, or 1-N NUMA nodes, + * in other words, CPU partition is a processors pool. + * + * CPU Partition Table (CPT) + * . a set of CPU partitions + * + * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP + * + * . User can specify total number of CPU partitions while creating a + * CPT, ID of CPU partition is always start from 0. + * + * Example: if there are 8 cores on the system, while creating a CPT + * with cpu_npartitions=4: + * core[0, 1] = partition[0], core[2, 3] = partition[1] + * core[4, 5] = partition[2], core[6, 7] = partition[3] + * + * cpu_npartitions=1: + * core[0, 1, ... 7] = partition[0] + * + * . User can also specify CPU partitions by string pattern + * + * Examples: cpu_partitions="0[0,1], 1[2,3]" + * cpu_partitions="N 0[0-3], 1[4-8]" + * + * The first character "N" means following numbers are numa ID + * + * . NUMA allocators, CPU affinity threads are built over CPU partitions, + * instead of HW CPUs or HW nodes. + * + * . By default, Lustre modules should refer to the global cfs_cpt_table, + * instead of accessing HW CPUs directly, so concurrency of Lustre can be + * configured by cpu_npartitions of the global cfs_cpt_table + * + * . If cpu_npartitions=1(all CPUs in one pool), lustre should work the + * same way as 2.2 or earlier versions + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_CPU_H__ +#define __LIBCFS_CPU_H__ + +#ifndef HAVE_LIBCFS_CPT + +typedef unsigned long cpumask_t; +typedef unsigned long nodemask_t; + +struct cfs_cpt_table { + /* # of CPU partitions */ + int ctb_nparts; + /* cpu mask */ + cpumask_t ctb_mask; + /* node mask */ + nodemask_t ctb_nodemask; + /* version */ + __u64 ctb_version; +}; + +#endif /* !HAVE_LIBCFS_CPT */ + +/* any CPU partition */ +#define CFS_CPT_ANY (-1) + +extern struct cfs_cpt_table *cfs_cpt_table; + +/** + * destroy a CPU partition table + */ +void cfs_cpt_table_free(struct cfs_cpt_table *cptab); +/** + * create a cfs_cpt_table with \a ncpt number of partitions + */ +struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt); +/** + * print string information of cpt-table + */ +int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); +/** + * return total number of CPU partitions in \a cptab + */ +int +cfs_cpt_number(struct cfs_cpt_table *cptab); +/** + * return number of HW cores or hypter-threadings in a CPU partition \a cpt + */ +int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); +/** + * is there any online CPU in CPU partition \a cpt + */ +int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); +/** + * return cpumask of CPU partition \a cpt + */ +cpumask_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); +/** + * return nodemask of CPU partition \a cpt + */ +nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); +/** + * shadow current HW processor ID to CPU-partition ID of \a cptab + */ +int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); +/** + * shadow HW processor ID \a CPU to CPU-partition ID by \a cptab + */ +int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); +/** + * bind current thread on a CPU-partition \a cpt of \a cptab + */ +int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); +/** + * add \a cpu to CPU partion @cpt of \a cptab, return 1 for success, + * otherwise 0 is returned + */ +int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * remove \a cpu from CPU partition \a cpt of \a cptab + */ +void cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); +/** + * add all cpus in \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * remove all cpus in \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, + int cpt, cpumask_t *mask); +/** + * add all cpus in NUMA node \a node to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); +/** + * remove all cpus in NUMA node \a node from CPU partition \a cpt + */ +void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); + +/** + * add all cpus in node mask \a mask to CPU partition \a cpt + * return 1 if successfully set all CPUs, otherwise return 0 + */ +int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * remove all cpus in node mask \a mask from CPU partition \a cpt + */ +void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, + int cpt, nodemask_t *mask); +/** + * unset all cpus for CPU partition \a cpt + */ +void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt); +/** + * convert partition id \a cpt to numa node id, if there are more than one + * nodes in this partition, it might return a different node id each time. + */ +int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); + +/** + * iterate over all CPU partitions in \a cptab + */ +#define cfs_cpt_for_each(i, cptab) \ + for (i = 0; i < cfs_cpt_number(cptab); i++) + +#ifndef __read_mostly +# define __read_mostly +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned +#endif + +int cfs_cpu_init(void); +void cfs_cpu_fini(void); + +#endif /* __LIBCFS_CPU_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h new file mode 100644 index 000000000000..64ca62f0cc93 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h @@ -0,0 +1,201 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +#ifndef _LIBCFS_CRYPTO_H +#define _LIBCFS_CRYPTO_H + +struct cfs_crypto_hash_type { + char *cht_name; /**< hash algorithm name, equal to + * format name for crypto api */ + unsigned int cht_key; /**< init key by default (vaild for + * 4 bytes context like crc32, adler */ + unsigned int cht_size; /**< hash digest size */ +}; + +enum cfs_crypto_hash_alg { + CFS_HASH_ALG_NULL = 0, + CFS_HASH_ALG_ADLER32, + CFS_HASH_ALG_CRC32, + CFS_HASH_ALG_MD5, + CFS_HASH_ALG_SHA1, + CFS_HASH_ALG_SHA256, + CFS_HASH_ALG_SHA384, + CFS_HASH_ALG_SHA512, + CFS_HASH_ALG_CRC32C, + CFS_HASH_ALG_MAX +}; + +static struct cfs_crypto_hash_type hash_types[] = { + [CFS_HASH_ALG_NULL] = { "null", 0, 0 }, + [CFS_HASH_ALG_ADLER32] = { "adler32", 1, 4 }, + [CFS_HASH_ALG_CRC32] = { "crc32", ~0, 4 }, + [CFS_HASH_ALG_CRC32C] = { "crc32c", ~0, 4 }, + [CFS_HASH_ALG_MD5] = { "md5", 0, 16 }, + [CFS_HASH_ALG_SHA1] = { "sha1", 0, 20 }, + [CFS_HASH_ALG_SHA256] = { "sha256", 0, 32 }, + [CFS_HASH_ALG_SHA384] = { "sha384", 0, 48 }, + [CFS_HASH_ALG_SHA512] = { "sha512", 0, 64 }, +}; + +/** Return pointer to type of hash for valid hash algorithm identifier */ +static inline const struct cfs_crypto_hash_type * + cfs_crypto_hash_type(unsigned char hash_alg) +{ + struct cfs_crypto_hash_type *ht; + + if (hash_alg < CFS_HASH_ALG_MAX) { + ht = &hash_types[hash_alg]; + if (ht->cht_name) + return ht; + } + return NULL; +} + +/** Return hash name for valid hash algorithm identifier or "unknown" */ +static inline const char *cfs_crypto_hash_name(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_name; + else + return "unknown"; +} + +/** Return digest size for valid algorithm identifier or 0 */ +static inline int cfs_crypto_hash_digestsize(unsigned char hash_alg) +{ + const struct cfs_crypto_hash_type *ht; + + ht = cfs_crypto_hash_type(hash_alg); + if (ht) + return ht->cht_size; + else + return 0; +} + +/** Return hash identifier for valid hash algorithm name or 0xFF */ +static inline unsigned char cfs_crypto_hash_alg(const char *algname) +{ + unsigned char i; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + if (!strcmp(hash_types[i].cht_name, algname)) + break; + return (i == CFS_HASH_ALG_MAX ? 0xFF : i); +} + +/** Calculate hash digest for buffer. + * @param alg id of hash algorithm + * @param buf buffer of data + * @param buf_len buffer len + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. + * @param key_len len of initial value + * @param hash [out] pointer to hash, if it is NULL, hash_len is + * set to valid digest size in bytes, retval -ENOSPC. + * @param hash_len [in,out] size of hash buffer + * @returns status of operation + * @retval -EINVAL if buf, buf_len, hash_len or alg_id is invalid + * @retval -ENODEV if this algorithm is unsupported + * @retval -ENOSPC if pointer to hash is NULL, or hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_digest(unsigned char alg, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len); + +/* cfs crypto hash descriptor */ +struct cfs_crypto_hash_desc; + +/** Allocate and initialize desriptor for hash algorithm. + * @param alg algorithm id + * @param key initial value for algorithm, if it is NULL, + * default initial value should be used. + * @param key_len len of initial value + * @returns pointer to descriptor of hash instance + * @retval ERR_PTR(error) when errors occured. + */ +struct cfs_crypto_hash_desc* + cfs_crypto_hash_init(unsigned char alg, + unsigned char *key, unsigned int key_len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param page data page + * @param offset data offset + * @param len data len + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *desc, + struct page *page, unsigned int offset, + unsigned int len); + +/** Update digest by part of data. + * @param desc hash descriptor + * @param buf pointer to data buffer + * @param buf_len size of data at buffer + * @returns status of operation + * @retval 0 for success. + */ +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *desc, const void *buf, + unsigned int buf_len); + +/** Finalize hash calculation, copy hash digest to buffer, destroy hash + * descriptor. + * @param desc hash descriptor + * @param hash buffer pointer to store hash digest + * @param hash_len pointer to hash buffer size, if NULL + * destory hash descriptor + * @returns status of operation + * @retval -ENOSPC if hash is NULL, or *hash_len less than + * digest size + * @retval 0 for success + * @retval < 0 other errors from lower layers. + */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *desc, + unsigned char *hash, unsigned int *hash_len); +/** + * Register crypto hash algorithms + */ +int cfs_crypto_register(void); + +/** + * Unregister + */ +void cfs_crypto_unregister(void); + +/** Return hash speed in Mbytes per second for valid hash algorithm + * identifier. If test was unsuccessfull -1 would be return. + */ +int cfs_crypto_hash_speed(unsigned char hash_alg); +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h new file mode 100644 index 000000000000..dd8ac2f52c9f --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h @@ -0,0 +1,350 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_debug.h + * + * Debug messages and assertions + * + */ + +#ifndef __LIBCFS_DEBUG_H__ +#define __LIBCFS_DEBUG_H__ + +/* + * Debugging + */ +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; +extern unsigned int libcfs_console_max_delay; +extern unsigned int libcfs_console_min_delay; +extern unsigned int libcfs_console_backoff; +extern unsigned int libcfs_debug_binary; +extern char libcfs_debug_file_path_arr[PATH_MAX]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); + +/* Has there been an LBUG? */ +extern unsigned int libcfs_catastrophe; +extern unsigned int libcfs_panic_on_lbug; + +/** + * Format for debug message headers + */ +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u16 ph_cpu_id; + __u16 ph_type; + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + + +#define PH_FLAG_FIRST_RECORD 1 + +/* Debugging subsystems (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define S_UNDEFINED 0x00000001 +#define S_MDC 0x00000002 +#define S_MDS 0x00000004 +#define S_OSC 0x00000008 +#define S_OST 0x00000010 +#define S_CLASS 0x00000020 +#define S_LOG 0x00000040 +#define S_LLITE 0x00000080 +#define S_RPC 0x00000100 +#define S_MGMT 0x00000200 +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ +#define S_PINGER 0x00001000 +#define S_FILTER 0x00002000 +/* unused */ +#define S_ECHO 0x00008000 +#define S_LDLM 0x00010000 +#define S_LOV 0x00020000 +#define S_LQUOTA 0x00040000 +#define S_OSD 0x00080000 +/* unused */ +/* unused */ +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ + +/* Debugging masks (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ +#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ +#define D_INODE 0x00000002 +#define D_SUPER 0x00000004 +#define D_EXT2 0x00000008 /* anything from ext2_debug */ +#define D_MALLOC 0x00000010 /* print malloc, free information */ +#define D_CACHE 0x00000020 /* cache-related items */ +#define D_INFO 0x00000040 /* general information */ +#define D_IOCTL 0x00000080 /* ioctl related information */ +#define D_NETERROR 0x00000100 /* network errors */ +#define D_NET 0x00000200 /* network communications */ +#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ +#define D_BUFFS 0x00000800 +#define D_OTHER 0x00001000 +#define D_DENTRY 0x00002000 +#define D_NETTRACE 0x00004000 +#define D_PAGE 0x00008000 /* bulk page handling */ +#define D_DLMTRACE 0x00010000 +#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ +#define D_HA 0x00080000 /* recovery and failover */ +#define D_RPCTRACE 0x00100000 /* for distributed debugging */ +#define D_VFSTRACE 0x00200000 +#define D_READA 0x00400000 /* read-ahead */ +#define D_MMAP 0x00800000 +#define D_CONFIG 0x01000000 +#define D_CONSOLE 0x02000000 +#define D_QUOTA 0x04000000 +#define D_SEC 0x08000000 +#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ +/* keep these in sync with lnet/{utils,libcfs}/debug.c */ + +#define D_HSM D_TRACE + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + +#define CDEBUG_DEFAULT_MAX_DELAY (cfs_time_seconds(600)) /* jiffies */ +#define CDEBUG_DEFAULT_MIN_DELAY ((cfs_time_seconds(1) + 1) / 2) /* jiffies */ +#define CDEBUG_DEFAULT_BACKOFF 2 +typedef struct { + cfs_time_t cdls_next; + unsigned int cdls_delay; + int cdls_count; +} cfs_debug_limit_state_t; + +struct libcfs_debug_msg_data { + const char *msg_file; + const char *msg_fn; + int msg_subsys; + int msg_line; + int msg_mask; + cfs_debug_limit_state_t *msg_cdls; +}; + +#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ +do { \ + (data)->msg_subsys = DEBUG_SUBSYSTEM; \ + (data)->msg_file = __FILE__; \ + (data)->msg_fn = __FUNCTION__; \ + (data)->msg_line = __LINE__; \ + (data)->msg_cdls = (cdls); \ + (data)->msg_mask = (mask); \ +} while (0) + +#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ + static struct libcfs_debug_msg_data dataname = { \ + .msg_subsys = DEBUG_SUBSYSTEM, \ + .msg_file = __FILE__, \ + .msg_fn = __FUNCTION__, \ + .msg_line = __LINE__, \ + .msg_cdls = (cdls) }; \ + dataname.msg_mask = (mask); + + + +/** + * Filters out logging messages based on mask and subsystem. + */ +static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) +{ + return mask & D_CANTMASK || + ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); +} + +#define __CDEBUG(cdls, mask, format, ...) \ +do { \ + static struct libcfs_debug_msg_data msgdata; \ + \ + CFS_CHECK_STACK(&msgdata, mask, cdls); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ + libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ + } \ +} while (0) + +#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) + +#define CDEBUG_LIMIT(mask, format, ...) \ +do { \ + static cfs_debug_limit_state_t cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## __VA_ARGS__);\ +} while (0) + + + + +#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) +#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) +#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) +#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) + +#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) +#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) +#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) +#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ + "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) +#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) + +#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) + + +void libcfs_log_goto(struct libcfs_debug_msg_data *, const char *, long_ptr_t); +#define GOTO(label, rc) \ +do { \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + libcfs_log_goto(&msgdata, #label, (long_ptr_t)(rc)); \ + } else { \ + (void)(rc); \ + } \ + goto label; \ +} while (0) + + +/* + * if rc == NULL, we need to code as RETURN((void *)NULL), otherwise + * there will be a warning in osx. + */ +#if defined(__GNUC__) + +long libcfs_log_return(struct libcfs_debug_msg_data *, long rc); +#if BITS_PER_LONG > 32 +#define RETURN(rc) \ +do { \ + EXIT_NESTING; \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + return (typeof(rc))libcfs_log_return(&msgdata, \ + (long)(rc)); \ + } \ + \ + return (rc); \ +} while (0) +#else /* BITS_PER_LONG == 32 */ +/* We need an on-stack variable, because we cannot case a 32-bit pointer + * directly to (long long) without generating a complier warning/error, yet + * casting directly to (long) will truncate 64-bit return values. The log + * values will print as 32-bit values, but they always have been. LU-1436 + */ +#define RETURN(rc) \ +do { \ + EXIT_NESTING; \ + if (cfs_cdebug_show(D_TRACE, DEBUG_SUBSYSTEM)) { \ + typeof(rc) __rc = (rc); \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_TRACE, NULL); \ + libcfs_log_return(&msgdata, (long_ptr_t)__rc); \ + return __rc; \ + } \ + \ + return (rc); \ +} while (0) +#endif /* BITS_PER_LONG > 32 */ + +#elif defined(_MSC_VER) +#define RETURN(rc) \ +do { \ + CDEBUG(D_TRACE, "Process leaving.\n"); \ + EXIT_NESTING; \ + return (rc); \ +} while (0) +#else +# error "Unkown compiler" +#endif /* __GNUC__ */ + +#define ENTRY \ +ENTRY_NESTING; \ +do { \ + CDEBUG(D_TRACE, "Process entered\n"); \ +} while (0) + +#define EXIT \ +do { \ + CDEBUG(D_TRACE, "Process leaving\n"); \ + EXIT_NESTING; \ +} while(0) + +#define RETURN_EXIT \ +do { \ + EXIT; \ + return; \ +} while (0) + +extern int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format1, ...) + __attribute__ ((format (printf, 2, 3))); + +extern int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, + va_list args, const char *format2, ...) + __attribute__ ((format (printf, 4, 5))); + +/* other external symbols that tracefile provides: */ +extern int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char *usr_buffer, int usr_buffer_nob); +extern int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append); + +#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" + +#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h new file mode 100644 index 000000000000..8393c2703ce6 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#ifndef _LIBCFS_FAIL_H +#define _LIBCFS_FAIL_H + +extern unsigned long cfs_fail_loc; +extern unsigned int cfs_fail_val; + +extern wait_queue_head_t cfs_race_waitq; +extern int cfs_race_state; + +int __cfs_fail_check_set(__u32 id, __u32 value, int set); +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set); + +enum { + CFS_FAIL_LOC_NOSET = 0, + CFS_FAIL_LOC_ORSET = 1, + CFS_FAIL_LOC_RESET = 2, + CFS_FAIL_LOC_VALUE = 3 +}; + +/* Failure injection control */ +#define CFS_FAIL_MASK_SYS 0x0000FF00 +#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) + +#define CFS_FAILED_BIT 30 +/* CFS_FAILED is 0x40000000 */ +#define CFS_FAILED (1 << CFS_FAILED_BIT) + +#define CFS_FAIL_ONCE_BIT 31 +/* CFS_FAIL_ONCE is 0x80000000 */ +#define CFS_FAIL_ONCE (1 << CFS_FAIL_ONCE_BIT) + +/* The following flags aren't made to be combined */ +#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ +#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ +#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ +#define CFS_FAIL_USR1 0x04000000 /* user flag */ + +#define CFS_FAIL_PRECHECK(id) (cfs_fail_loc && \ + (cfs_fail_loc & CFS_FAIL_MASK_LOC) == \ + ((id) & CFS_FAIL_MASK_LOC)) + +static inline int cfs_fail_check_set(__u32 id, __u32 value, + int set, int quiet) +{ + int ret = 0; + + if (unlikely(CFS_FAIL_PRECHECK(id) && + (ret = __cfs_fail_check_set(id, value, set)))) { + if (quiet) { + CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } else { + LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", + id, value); + } + } + + return ret; +} + +/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ +#define CFS_FAIL_CHECK(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) +#define CFS_FAIL_CHECK_QUIET(id) \ + cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) + +/* If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_VALUE(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) +#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_ORSET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) +#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) + +/* If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, + * otherwise return 0 */ +#define CFS_FAIL_CHECK_RESET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) +#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ + cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) + +static inline int cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + if (unlikely(CFS_FAIL_PRECHECK(id))) + return __cfs_fail_timeout_set(id, value, ms, set); + else + return 0; +} + +/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT(id, secs) \ + cfs_fail_timeout_set(id, 0, secs * 1000, CFS_FAIL_LOC_NOSET) + +#define CFS_FAIL_TIMEOUT_MS(id, ms) \ + cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) + +/* If id hit cfs_fail_loc, cfs_fail_loc |= value and + * sleep seconds or milliseconds */ +#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ + cfs_fail_timeout_set(id, value, secs * 1000, CFS_FAIL_LOC_ORSET) + +#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ + cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) + +/* The idea here is to synchronise two threads to force a race. The + * first thread that calls this with a matching fail_loc is put to + * sleep. The next thread that calls with the same fail_loc wakes up + * the first and continues. */ +static inline void cfs_race(__u32 id) +{ + + if (CFS_FAIL_PRECHECK(id)) { + if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { + int rc; + cfs_race_state = 0; + CERROR("cfs_race id %x sleeping\n", id); + cfs_wait_event_interruptible(cfs_race_waitq, + cfs_race_state != 0, rc); + CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); + } else { + CERROR("cfs_fail_race id %x waking\n", id); + cfs_race_state = 1; + wake_up(&cfs_race_waitq); + } + } +} +#define CFS_RACE(id) cfs_race(id) + +#endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h new file mode 100644 index 000000000000..c5b371569da8 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h @@ -0,0 +1,850 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_hash.h + * + * Hashing routines + * + */ + +#ifndef __LIBCFS_HASH_H__ +#define __LIBCFS_HASH_H__ +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL + +/* + * Ideally we would use HAVE_HASH_LONG for this, but on linux we configure + * the linux kernel and user space at the same time, so we need to differentiate + * between them explicitely. If this is not needed on other architectures, then + * we'll need to move the functions to archi specific headers. + */ + +#include + +#define cfs_hash_long(val, bits) hash_long(val, bits) + +/** disable debug */ +#define CFS_HASH_DEBUG_NONE 0 +/** record hash depth and output to console when it's too deep, + * computing overhead is low but consume more memory */ +#define CFS_HASH_DEBUG_1 1 +/** expensive, check key validation */ +#define CFS_HASH_DEBUG_2 2 + +#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE + +struct cfs_hash_ops; +struct cfs_hash_lock_ops; +struct cfs_hash_hlist_ops; + +typedef union { + rwlock_t rw; /**< rwlock */ + spinlock_t spin; /**< spinlock */ +} cfs_hash_lock_t; + +/** + * cfs_hash_bucket is a container of: + * - lock, couter ... + * - array of hash-head starting from hsb_head[0], hash-head can be one of + * . cfs_hash_head_t + * . cfs_hash_head_dep_t + * . cfs_hash_dhead_t + * . cfs_hash_dhead_dep_t + * which depends on requirement of user + * - some extra bytes (caller can require it while creating hash) + */ +typedef struct cfs_hash_bucket { + cfs_hash_lock_t hsb_lock; /**< bucket lock */ + __u32 hsb_count; /**< current entries */ + __u32 hsb_version; /**< change version */ + unsigned int hsb_index; /**< index of bucket */ + int hsb_depmax; /**< max depth on bucket */ + long hsb_head[0]; /**< hash-head array */ +} cfs_hash_bucket_t; + +/** + * cfs_hash bucket descriptor, it's normally in stack of caller + */ +typedef struct cfs_hash_bd { + cfs_hash_bucket_t *bd_bucket; /**< address of bucket */ + unsigned int bd_offset; /**< offset in bucket */ +} cfs_hash_bd_t; + +#define CFS_HASH_NAME_LEN 16 /**< default name length */ +#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ + +#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ +#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ +#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS + +/** + * common hash attributes. + */ +enum cfs_hash_tag { + /** + * don't need any lock, caller will protect operations with it's + * own lock. With this flag: + * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK + * will be ignored. + * . Some functions will be disabled with this flag, i.e: + * cfs_hash_for_each_empty, cfs_hash_rehash + */ + CFS_HASH_NO_LOCK = 1 << 0, + /** no bucket lock, use one spinlock to protect the whole hash */ + CFS_HASH_NO_BKTLOCK = 1 << 1, + /** rwlock to protect bucket */ + CFS_HASH_RW_BKTLOCK = 1 << 2, + /** spinlcok to protect bucket */ + CFS_HASH_SPIN_BKTLOCK = 1 << 3, + /** always add new item to tail */ + CFS_HASH_ADD_TAIL = 1 << 4, + /** hash-table doesn't have refcount on item */ + CFS_HASH_NO_ITEMREF = 1 << 5, + /** big name for param-tree */ + CFS_HASH_BIGNAME = 1 << 6, + /** track global count */ + CFS_HASH_COUNTER = 1 << 7, + /** rehash item by new key */ + CFS_HASH_REHASH_KEY = 1 << 8, + /** Enable dynamic hash resizing */ + CFS_HASH_REHASH = 1 << 9, + /** can shrink hash-size */ + CFS_HASH_SHRINK = 1 << 10, + /** assert hash is empty on exit */ + CFS_HASH_ASSERT_EMPTY = 1 << 11, + /** record hlist depth */ + CFS_HASH_DEPTH = 1 << 12, + /** + * rehash is always scheduled in a different thread, so current + * change on hash table is non-blocking + */ + CFS_HASH_NBLK_CHANGE = 1 << 13, + /** NB, we typed hs_flags as __u16, please change it + * if you need to extend >=16 flags */ +}; + +/** most used attributes */ +#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ + CFS_HASH_COUNTER | CFS_HASH_REHASH) + +/** + * cfs_hash is a hash-table implementation for general purpose, it can support: + * . two refcount modes + * hash-table with & without refcount + * . four lock modes + * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock + * . general operations + * lookup, add(add_tail or add_head), delete + * . rehash + * grows or shrink + * . iteration + * locked iteration and unlocked iteration + * . bigname + * support long name hash + * . debug + * trace max searching depth + * + * Rehash: + * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) + * is spawned to handle the rehash in the background, it's possible that other + * processes can concurrently perform additions, deletions, and lookups + * without being blocked on rehash completion, because rehash will release + * the global wrlock for each bucket. + * + * rehash and iteration can't run at the same time because it's too tricky + * to keep both of them safe and correct. + * As they are relatively rare operations, so: + * . if iteration is in progress while we try to launch rehash, then + * it just giveup, iterator will launch rehash at the end. + * . if rehash is in progress while we try to iterate the hash table, + * then we just wait (shouldn't be very long time), anyway, nobody + * should expect iteration of whole hash-table to be non-blocking. + * + * During rehashing, a (key,object) pair may be in one of two buckets, + * depending on whether the worker task has yet to transfer the object + * to its new location in the table. Lookups and deletions need to search both + * locations; additions must take care to only insert into the new bucket. + */ + +typedef struct cfs_hash { + /** serialize with rehash, or serialize all operations if + * the hash-table has CFS_HASH_NO_BKTLOCK */ + cfs_hash_lock_t hs_lock; + /** hash operations */ + struct cfs_hash_ops *hs_ops; + /** hash lock operations */ + struct cfs_hash_lock_ops *hs_lops; + /** hash list operations */ + struct cfs_hash_hlist_ops *hs_hops; + /** hash buckets-table */ + cfs_hash_bucket_t **hs_buckets; + /** total number of items on this hash-table */ + atomic_t hs_count; + /** hash flags, see cfs_hash_tag for detail */ + __u16 hs_flags; + /** # of extra-bytes for bucket, for user saving extended attributes */ + __u16 hs_extra_bytes; + /** wants to iterate */ + __u8 hs_iterating; + /** hash-table is dying */ + __u8 hs_exiting; + /** current hash bits */ + __u8 hs_cur_bits; + /** min hash bits */ + __u8 hs_min_bits; + /** max hash bits */ + __u8 hs_max_bits; + /** bits for rehash */ + __u8 hs_rehash_bits; + /** bits for each bucket */ + __u8 hs_bkt_bits; + /** resize min threshold */ + __u16 hs_min_theta; + /** resize max threshold */ + __u16 hs_max_theta; + /** resize count */ + __u32 hs_rehash_count; + /** # of iterators (caller of cfs_hash_for_each_*) */ + __u32 hs_iterators; + /** rehash workitem */ + cfs_workitem_t hs_rehash_wi; + /** refcount on this hash table */ + atomic_t hs_refcount; + /** rehash buckets-table */ + cfs_hash_bucket_t **hs_rehash_buckets; +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + /** serialize debug members */ + spinlock_t hs_dep_lock; + /** max depth */ + unsigned int hs_dep_max; + /** id of the deepest bucket */ + unsigned int hs_dep_bkt; + /** offset in the deepest bucket */ + unsigned int hs_dep_off; + /** bits when we found the max depth */ + unsigned int hs_dep_bits; + /** workitem to output max depth */ + cfs_workitem_t hs_dep_wi; +#endif + /** name of htable */ + char hs_name[0]; +} cfs_hash_t; + +typedef struct cfs_hash_lock_ops { + /** lock the hash table */ + void (*hs_lock)(cfs_hash_lock_t *lock, int exclusive); + /** unlock the hash table */ + void (*hs_unlock)(cfs_hash_lock_t *lock, int exclusive); + /** lock the hash bucket */ + void (*hs_bkt_lock)(cfs_hash_lock_t *lock, int exclusive); + /** unlock the hash bucket */ + void (*hs_bkt_unlock)(cfs_hash_lock_t *lock, int exclusive); +} cfs_hash_lock_ops_t; + +typedef struct cfs_hash_hlist_ops { + /** return hlist_head of hash-head of @bd */ + struct hlist_head *(*hop_hhead)(cfs_hash_t *hs, cfs_hash_bd_t *bd); + /** return hash-head size */ + int (*hop_hhead_size)(cfs_hash_t *hs); + /** add @hnode to hash-head of @bd */ + int (*hop_hnode_add)(cfs_hash_t *hs, + cfs_hash_bd_t *bd, struct hlist_node *hnode); + /** remove @hnode from hash-head of @bd */ + int (*hop_hnode_del)(cfs_hash_t *hs, + cfs_hash_bd_t *bd, struct hlist_node *hnode); +} cfs_hash_hlist_ops_t; + +typedef struct cfs_hash_ops { + /** return hashed value from @key */ + unsigned (*hs_hash)(cfs_hash_t *hs, const void *key, unsigned mask); + /** return key address of @hnode */ + void * (*hs_key)(struct hlist_node *hnode); + /** copy key from @hnode to @key */ + void (*hs_keycpy)(struct hlist_node *hnode, void *key); + /** + * compare @key with key of @hnode + * returns 1 on a match + */ + int (*hs_keycmp)(const void *key, struct hlist_node *hnode); + /** return object address of @hnode, i.e: container_of(...hnode) */ + void * (*hs_object)(struct hlist_node *hnode); + /** get refcount of item, always called with holding bucket-lock */ + void (*hs_get)(cfs_hash_t *hs, struct hlist_node *hnode); + /** release refcount of item */ + void (*hs_put)(cfs_hash_t *hs, struct hlist_node *hnode); + /** release refcount of item, always called with holding bucket-lock */ + void (*hs_put_locked)(cfs_hash_t *hs, struct hlist_node *hnode); + /** it's called before removing of @hnode */ + void (*hs_exit)(cfs_hash_t *hs, struct hlist_node *hnode); +} cfs_hash_ops_t; + +/** total number of buckets in @hs */ +#define CFS_HASH_NBKT(hs) \ + (1U << ((hs)->hs_cur_bits - (hs)->hs_bkt_bits)) + +/** total number of buckets in @hs while rehashing */ +#define CFS_HASH_RH_NBKT(hs) \ + (1U << ((hs)->hs_rehash_bits - (hs)->hs_bkt_bits)) + +/** number of hlist for in bucket */ +#define CFS_HASH_BKT_NHLIST(hs) (1U << (hs)->hs_bkt_bits) + +/** total number of hlist in @hs */ +#define CFS_HASH_NHLIST(hs) (1U << (hs)->hs_cur_bits) + +/** total number of hlist in @hs while rehashing */ +#define CFS_HASH_RH_NHLIST(hs) (1U << (hs)->hs_rehash_bits) + +static inline int +cfs_hash_with_no_lock(cfs_hash_t *hs) +{ + /* caller will serialize all operations for this hash-table */ + return (hs->hs_flags & CFS_HASH_NO_LOCK) != 0; +} + +static inline int +cfs_hash_with_no_bktlock(cfs_hash_t *hs) +{ + /* no bucket lock, one single lock to protect the hash-table */ + return (hs->hs_flags & CFS_HASH_NO_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_rw_bktlock(cfs_hash_t *hs) +{ + /* rwlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_RW_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_spin_bktlock(cfs_hash_t *hs) +{ + /* spinlock to protect hash bucket */ + return (hs->hs_flags & CFS_HASH_SPIN_BKTLOCK) != 0; +} + +static inline int +cfs_hash_with_add_tail(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_ADD_TAIL) != 0; +} + +static inline int +cfs_hash_with_no_itemref(cfs_hash_t *hs) +{ + /* hash-table doesn't keep refcount on item, + * item can't be removed from hash unless it's + * ZERO refcount */ + return (hs->hs_flags & CFS_HASH_NO_ITEMREF) != 0; +} + +static inline int +cfs_hash_with_bigname(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_BIGNAME) != 0; +} + +static inline int +cfs_hash_with_counter(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_COUNTER) != 0; +} + +static inline int +cfs_hash_with_rehash(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH) != 0; +} + +static inline int +cfs_hash_with_rehash_key(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_REHASH_KEY) != 0; +} + +static inline int +cfs_hash_with_shrink(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_SHRINK) != 0; +} + +static inline int +cfs_hash_with_assert_empty(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_ASSERT_EMPTY) != 0; +} + +static inline int +cfs_hash_with_depth(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_DEPTH) != 0; +} + +static inline int +cfs_hash_with_nblk_change(cfs_hash_t *hs) +{ + return (hs->hs_flags & CFS_HASH_NBLK_CHANGE) != 0; +} + +static inline int +cfs_hash_is_exiting(cfs_hash_t *hs) +{ /* cfs_hash_destroy is called */ + return hs->hs_exiting; +} + +static inline int +cfs_hash_is_rehashing(cfs_hash_t *hs) +{ /* rehash is launched */ + return hs->hs_rehash_bits != 0; +} + +static inline int +cfs_hash_is_iterating(cfs_hash_t *hs) +{ /* someone is calling cfs_hash_for_each_* */ + return hs->hs_iterating || hs->hs_iterators != 0; +} + +static inline int +cfs_hash_bkt_size(cfs_hash_t *hs) +{ + return offsetof(cfs_hash_bucket_t, hsb_head[0]) + + hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + + hs->hs_extra_bytes; +} + +#define CFS_HOP(hs, op) (hs)->hs_ops->hs_ ## op + +static inline unsigned +cfs_hash_id(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return CFS_HOP(hs, hash)(hs, key, mask); +} + +static inline void * +cfs_hash_key(cfs_hash_t *hs, struct hlist_node *hnode) +{ + return CFS_HOP(hs, key)(hnode); +} + +static inline void +cfs_hash_keycpy(cfs_hash_t *hs, struct hlist_node *hnode, void *key) +{ + if (CFS_HOP(hs, keycpy) != NULL) + CFS_HOP(hs, keycpy)(hnode, key); +} + +/** + * Returns 1 on a match, + */ +static inline int +cfs_hash_keycmp(cfs_hash_t *hs, const void *key, struct hlist_node *hnode) +{ + return CFS_HOP(hs, keycmp)(key, hnode); +} + +static inline void * +cfs_hash_object(cfs_hash_t *hs, struct hlist_node *hnode) +{ + return CFS_HOP(hs, object)(hnode); +} + +static inline void +cfs_hash_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + return CFS_HOP(hs, get)(hs, hnode); +} + +static inline void +cfs_hash_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + LASSERT(CFS_HOP(hs, put_locked) != NULL); + + return CFS_HOP(hs, put_locked)(hs, hnode); +} + +static inline void +cfs_hash_put(cfs_hash_t *hs, struct hlist_node *hnode) +{ + LASSERT(CFS_HOP(hs, put) != NULL); + + return CFS_HOP(hs, put)(hs, hnode); +} + +static inline void +cfs_hash_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + if (CFS_HOP(hs, exit)) + CFS_HOP(hs, exit)(hs, hnode); +} + +static inline void cfs_hash_lock(cfs_hash_t *hs, int excl) +{ + hs->hs_lops->hs_lock(&hs->hs_lock, excl); +} + +static inline void cfs_hash_unlock(cfs_hash_t *hs, int excl) +{ + hs->hs_lops->hs_unlock(&hs->hs_lock, excl); +} + +static inline int cfs_hash_dec_and_lock(cfs_hash_t *hs, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_no_bktlock(hs)); + return atomic_dec_and_lock(condition, &hs->hs_lock.spin); +} + +static inline void cfs_hash_bd_lock(cfs_hash_t *hs, + cfs_hash_bd_t *bd, int excl) +{ + hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); +} + +static inline void cfs_hash_bd_unlock(cfs_hash_t *hs, + cfs_hash_bd_t *bd, int excl) +{ + hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); +} + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are normally for hash-table without rehash + */ +void cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd); + +static inline void cfs_hash_bd_get_and_lock(cfs_hash_t *hs, const void *key, + cfs_hash_bd_t *bd, int excl) +{ + cfs_hash_bd_get(hs, key, bd); + cfs_hash_bd_lock(hs, bd, excl); +} + +static inline unsigned cfs_hash_bd_index_get(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); +} + +static inline void cfs_hash_bd_index_set(cfs_hash_t *hs, + unsigned index, cfs_hash_bd_t *bd) +{ + bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; + bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); +} + +static inline void * +cfs_hash_bd_extra_get(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + return (void *)bd->bd_bucket + + cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; +} + +static inline __u32 +cfs_hash_bd_version_get(cfs_hash_bd_t *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_version; +} + +static inline __u32 +cfs_hash_bd_count_get(cfs_hash_bd_t *bd) +{ + /* need hold cfs_hash_bd_lock */ + return bd->bd_bucket->hsb_count; +} + +static inline int +cfs_hash_bd_depmax_get(cfs_hash_bd_t *bd) +{ + return bd->bd_bucket->hsb_depmax; +} + +static inline int +cfs_hash_bd_compare(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2) +{ + if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) + return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; + + if (bd1->bd_offset != bd2->bd_offset) + return bd1->bd_offset - bd2->bd_offset; + + return 0; +} + +void cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode); +void cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode); +void cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old, + cfs_hash_bd_t *bd_new, struct hlist_node *hnode); + +static inline int cfs_hash_bd_dec_and_lock(cfs_hash_t *hs, cfs_hash_bd_t *bd, + atomic_t *condition) +{ + LASSERT(cfs_hash_with_spin_bktlock(hs)); + return atomic_dec_and_lock(condition, + &bd->bd_bucket->hsb_lock.spin); +} + +static inline struct hlist_head *cfs_hash_bd_hhead(cfs_hash_t *hs, + cfs_hash_bd_t *bd) +{ + return hs->hs_hops->hop_hhead(hs, bd); +} + +struct hlist_node *cfs_hash_bd_lookup_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bd, const void *key); +struct hlist_node *cfs_hash_bd_peek_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bd, const void *key); +struct hlist_node *cfs_hash_bd_findadd_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bd, const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_bd_finddel_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bd, const void *key, + struct hlist_node *hnode); + +/** + * operations on cfs_hash bucket (bd: bucket descriptor), + * they are safe for hash-table with rehash + */ +void cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds); +void cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl); +void cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl); + +static inline void cfs_hash_dual_bd_get_and_lock(cfs_hash_t *hs, const void *key, + cfs_hash_bd_t *bds, int excl) +{ + cfs_hash_dual_bd_get(hs, key, bds); + cfs_hash_dual_bd_lock(hs, bds, excl); +} + +struct hlist_node *cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bds, + const void *key); +struct hlist_node *cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bds, + const void *key, + struct hlist_node *hnode, + int insist_add); +struct hlist_node *cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bds, + const void *key, + struct hlist_node *hnode); + +/* Hash init/cleanup functions */ +cfs_hash_t *cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags); + +cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs); +void cfs_hash_putref(cfs_hash_t *hs); + +/* Hash addition functions */ +void cfs_hash_add(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode); +int cfs_hash_add_unique(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode); +void *cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode); + +/* Hash deletion functions */ +void *cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode); +void *cfs_hash_del_key(cfs_hash_t *hs, const void *key); + +/* Hash lookup/for_each functions */ +#define CFS_HASH_LOOP_HOG 1024 + +typedef int (*cfs_hash_for_each_cb_t)(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *node, void *data); +void *cfs_hash_lookup(cfs_hash_t *hs, const void *key); +void cfs_hash_for_each(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_safe(cfs_hash_t *hs, cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_nolock(cfs_hash_t *hs, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_for_each_empty(cfs_hash_t *hs, + cfs_hash_for_each_cb_t, void *data); +void cfs_hash_for_each_key(cfs_hash_t *hs, const void *key, + cfs_hash_for_each_cb_t, void *data); +typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); +void cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t, void *data); + +void cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex, + cfs_hash_for_each_cb_t, void *data); +int cfs_hash_is_empty(cfs_hash_t *hs); +__u64 cfs_hash_size_get(cfs_hash_t *hs); + +/* + * Rehash - Theta is calculated to be the average chained + * hash depth assuming a perfectly uniform hash funcion. + */ +void cfs_hash_rehash_cancel_locked(cfs_hash_t *hs); +void cfs_hash_rehash_cancel(cfs_hash_t *hs); +int cfs_hash_rehash(cfs_hash_t *hs, int do_rehash); +void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key, + void *new_key, struct hlist_node *hnode); + +#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 +/* Validate hnode references the correct key */ +static inline void +cfs_hash_key_validate(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode) +{ + LASSERT(cfs_hash_keycmp(hs, key, hnode)); +} + +/* Validate hnode is in the correct bucket */ +static inline void +cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + cfs_hash_bd_t bds[2]; + + cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); + LASSERT(bds[0].bd_bucket == bd->bd_bucket || + bds[1].bd_bucket == bd->bd_bucket); +} + +#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ + +static inline void +cfs_hash_key_validate(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode) {} + +static inline void +cfs_hash_bucket_validate(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) {} + +#endif /* CFS_HASH_DEBUG_LEVEL */ + +#define CFS_HASH_THETA_BITS 10 +#define CFS_HASH_MIN_THETA (1U << (CFS_HASH_THETA_BITS - 1)) +#define CFS_HASH_MAX_THETA (1U << (CFS_HASH_THETA_BITS + 1)) + +/* Return integer component of theta */ +static inline int __cfs_hash_theta_int(int theta) +{ + return (theta >> CFS_HASH_THETA_BITS); +} + +/* Return a fractional value between 0 and 999 */ +static inline int __cfs_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> CFS_HASH_THETA_BITS) - + (__cfs_hash_theta_int(theta) * 1000); +} + +static inline int __cfs_hash_theta(cfs_hash_t *hs) +{ + return (atomic_read(&hs->hs_count) << + CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; +} + +static inline void __cfs_hash_set_theta(cfs_hash_t *hs, int min, int max) +{ + LASSERT(min < max); + hs->hs_min_theta = (__u16)min; + hs->hs_max_theta = (__u16)max; +} + +/* Generic debug formatting routines mainly for proc handler */ +int cfs_hash_debug_header(char *str, int size); +int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size); + +/* + * Generic djb2 hash algorithm for character arrays. + */ +static inline unsigned +cfs_hash_djb2_hash(const void *key, size_t size, unsigned mask) +{ + unsigned i, hash = 5381; + + LASSERT(key != NULL); + + for (i = 0; i < size; i++) + hash = hash * 33 + ((char *)key)[i]; + + return (hash & mask); +} + +/* + * Generic u32 hash algorithm. + */ +static inline unsigned +cfs_hash_u32_hash(const __u32 key, unsigned mask) +{ + return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); +} + +/* + * Generic u64 hash algorithm. + */ +static inline unsigned +cfs_hash_u64_hash(const __u64 key, unsigned mask) +{ + return ((unsigned)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); +} + +/** iterate over all buckets in @bds (array of cfs_hash_bd_t) */ +#define cfs_hash_for_each_bd(bds, n, i) \ + for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) + +/** iterate over all buckets of @hs */ +#define cfs_hash_for_each_bucket(hs, bd, pos) \ + for (pos = 0; \ + pos < CFS_HASH_NBKT(hs) && \ + ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) + +/** iterate over all hlist of bucket @bd */ +#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ + for ((bd)->bd_offset = 0; \ + (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ + (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ + (bd)->bd_offset++) + +/* !__LIBCFS__HASH_H__ */ +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h new file mode 100644 index 000000000000..bfa6d7b245ea --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_heap.h @@ -0,0 +1,200 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/include/libcfs/heap.h + * + * Author: Eric Barton + * Liang Zhen + */ + +#ifndef __LIBCFS_HEAP_H__ +#define __LIBCFS_HEAP_H__ + +/** \defgroup heap Binary heap + * + * The binary heap is a scalable data structure created using a binary tree. It + * is capable of maintaining large sets of elements sorted usually by one or + * more element properties, but really based on anything that can be used as a + * binary predicate in order to determine the relevant ordering of any two nodes + * that belong to the set. There is no search operation, rather the intention is + * for the element of the lowest priority which will always be at the root of + * the tree (as this is an implementation of a min-heap) to be removed by users + * for consumption. + * + * Users of the heap should embed a \e cfs_binheap_node_t object instance on + * every object of the set that they wish the binary heap instance to handle, + * and (at a minimum) provide a cfs_binheap_ops_t::hop_compare() implementation + * which is used by the heap as the binary predicate during its internal sorting + * operations. + * + * The current implementation enforces no locking scheme, and so assumes the + * user caters for locking between calls to insert, delete and lookup + * operations. Since the only consumer for the data structure at this point + * are NRS policies, and these operate on a per-CPT basis, binary heap instances + * are tied to a specific CPT. + * @{ + */ + +/** + * Binary heap node. + * + * Objects of this type are embedded into objects of the ordered set that is to + * be maintained by a \e cfs_binheap_t instance. + */ +typedef struct { + /** Index into the binary tree */ + unsigned int chn_index; +} cfs_binheap_node_t; + +#define CBH_SHIFT 9 +#define CBH_SIZE (1 << CBH_SHIFT) /* # ptrs per level */ +#define CBH_MASK (CBH_SIZE - 1) +#define CBH_NOB (CBH_SIZE * sizeof(cfs_binheap_node_t *)) + +#define CBH_POISON 0xdeadbeef + +/** + * Binary heap flags. + */ +enum { + CBH_FLAG_ATOMIC_GROW = 1, +}; + +struct cfs_binheap; + +/** + * Binary heap operations. + */ +typedef struct { + /** + * Called right before inserting a node into the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 success + * \retval != 0 error + */ + int (*hop_enter)(struct cfs_binheap *h, + cfs_binheap_node_t *e); + /** + * Called right after removing a node from the binary heap. + * + * Implementing this operation is optional. + * + * \param[in] h The heap + * \param[in] e The node + */ + void (*hop_exit)(struct cfs_binheap *h, + cfs_binheap_node_t *e); + /** + * A binary predicate which is called during internal heap sorting + * operations, and used in order to determine the relevant ordering of + * two heap nodes. + * + * Implementing this operation is mandatory. + * + * \param[in] a The first heap node + * \param[in] b The second heap node + * + * \retval 0 Node a > node b + * \retval 1 Node a < node b + * + * \see cfs_binheap_bubble() + * \see cfs_biheap_sink() + */ + int (*hop_compare)(cfs_binheap_node_t *a, + cfs_binheap_node_t *b); +} cfs_binheap_ops_t; + +/** + * Binary heap object. + * + * Sorts elements of type \e cfs_binheap_node_t + */ +typedef struct cfs_binheap { + /** Triple indirect */ + cfs_binheap_node_t ****cbh_elements3; + /** double indirect */ + cfs_binheap_node_t ***cbh_elements2; + /** single indirect */ + cfs_binheap_node_t **cbh_elements1; + /** # elements referenced */ + unsigned int cbh_nelements; + /** high water mark */ + unsigned int cbh_hwm; + /** user flags */ + unsigned int cbh_flags; + /** operations table */ + cfs_binheap_ops_t *cbh_ops; + /** private data */ + void *cbh_private; + /** associated CPT table */ + struct cfs_cpt_table *cbh_cptab; + /** associated CPT id of this cfs_binheap_t::cbh_cptab */ + int cbh_cptid; +} cfs_binheap_t; + +void cfs_binheap_destroy(cfs_binheap_t *h); +cfs_binheap_t *cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags, + unsigned count, void *arg, + struct cfs_cpt_table *cptab, int cptid); +cfs_binheap_node_t *cfs_binheap_find(cfs_binheap_t *h, unsigned int idx); +int cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e); +void cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e); + +static inline int +cfs_binheap_size(cfs_binheap_t *h) +{ + return h->cbh_nelements; +} + +static inline int +cfs_binheap_is_empty(cfs_binheap_t *h) +{ + return h->cbh_nelements == 0; +} + +static inline cfs_binheap_node_t * +cfs_binheap_root(cfs_binheap_t *h) +{ + return cfs_binheap_find(h, 0); +} + +static inline cfs_binheap_node_t * +cfs_binheap_remove_root(cfs_binheap_t *h) +{ + cfs_binheap_node_t *e = cfs_binheap_find(h, 0); + + if (e != NULL) + cfs_binheap_remove(h, e); + return e; +} + +/** @} heap */ + +#endif /* __LIBCFS_HEAP_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h new file mode 100644 index 000000000000..5be367973508 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_ioctl.h @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_ioctl.h + * + * Low-level ioctl data structures. Kernel ioctl functions declared here, + * and user space functions are in libcfsutil_ioctl.h. + * + */ + +#ifndef __LIBCFS_IOCTL_H__ +#define __LIBCFS_IOCTL_H__ + + +#define LIBCFS_IOCTL_VERSION 0x0001000a + +struct libcfs_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + + +struct libcfs_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +struct libcfs_debug_ioctl_data +{ + struct libcfs_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +#define LIBCFS_IOC_INIT(data) \ +do { \ + memset(&data, 0, sizeof(data)); \ + data.ioc_version = LIBCFS_IOCTL_VERSION; \ + data.ioc_len = sizeof(data); \ +} while (0) + + +struct libcfs_ioctl_handler { + struct list_head item; + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data); +}; + +#define DECLARE_IOCTL_HANDLER(ident, func) \ + struct libcfs_ioctl_handler ident = { \ + /* .item = */ LIST_HEAD_INIT(ident.item), \ + /* .handle_ioctl = */ func \ + } + + +/* FIXME check conflict with lustre_lib.h */ +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) + + +/* ioctls for manipulating snapshots 30- */ +#define IOC_LIBCFS_TYPE 'e' +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +#define IOC_LIBCFS_PANIC _IOWR('e', 30, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_CONTROL _IOWR('e', 33, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_SNAPSHOT _IOWR('e', 34, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_LOOKUP_STRING _IOWR('e', 35, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MEMHOG _IOWR('e', 36, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING_TEST _IOWR('e', 37, IOCTL_LIBCFS_TYPE) +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_ROUTE _IOWR('e', 52, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR('e', 53, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_ROUTE _IOWR('e', 54, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) + +#define IOC_LIBCFS_MAX_NR 80 + +static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) +{ + int len = sizeof(*data); + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + return len; +} + +static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR ("LIBCFS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR ("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR ("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR ("LIBCFS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR ("LIBCFS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR ("LIBCFS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR ("LIBCFS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR ("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR ("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("LIBCFS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR ("LIBCFS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[cfs_size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR ("LIBCFS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + + +extern int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); +extern int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg); +extern int libcfs_ioctl_popdata(void *arg, void *buf, int size); + + +#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h new file mode 100644 index 000000000000..596a15fc8996 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_kernelcomm.h @@ -0,0 +1,117 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * libcfs/include/libcfs/libcfs_kernelcomm.h + * + * Kernel <-> userspace communication routines. + * The definitions below are used in the kernel and userspace. + * + */ + +#ifndef __LIBCFS_KERNELCOMM_H__ +#define __LIBCFS_KERNELCOMM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +/* KUC message header. + * All current and future KUC messages should use this header. + * To avoid having to include Lustre headers from libcfs, define this here. + */ +struct kuc_hdr { + __u16 kuc_magic; + __u8 kuc_transport; /* Each new Lustre feature should use a different + transport */ + __u8 kuc_flags; + __u16 kuc_msgtype; /* Message type or opcode, transport-specific */ + __u16 kuc_msglen; /* Including header */ +} __attribute__((aligned(sizeof(__u64)))); + +#define KUC_MAGIC 0x191C /*Lustre9etLinC */ +#define KUC_FL_BLOCK 0x01 /* Wait for send */ + +/* kuc_msgtype values are defined in each transport */ +enum kuc_transport_type { + KUC_TRANSPORT_GENERIC = 1, + KUC_TRANSPORT_HSM = 2, + KUC_TRANSPORT_CHANGELOG = 3, +}; + +enum kuc_generic_message_type { + KUC_MSG_SHUTDOWN = 1, +}; + +/* prototype for callback function on kuc groups */ +typedef int (*libcfs_kkuc_cb_t)(__u32 data, void *cb_arg); + +/* KUC Broadcast Groups. This determines which userspace process hears which + * messages. Mutliple transports may be used within a group, or multiple + * groups may use the same transport. Broadcast + * groups need not be used if e.g. a UID is specified instead; + * use group 0 to signify unicast. + */ +#define KUC_GRP_HSM 0x02 +#define KUC_GRP_MAX KUC_GRP_HSM + +/* Kernel methods */ +extern int libcfs_kkuc_msg_put(struct file *fp, void *payload); +extern int libcfs_kkuc_group_put(int group, void *payload); +extern int libcfs_kkuc_group_add(struct file *fp, int uid, int group, + __u32 data); +extern int libcfs_kkuc_group_rem(int uid, int group); +extern int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg); + +#define LK_FLG_STOP 0x01 + +/* kernelcomm control structure, passed from userspace to kernel */ +typedef struct lustre_kernelcomm { + __u32 lk_wfd; + __u32 lk_rfd; + __u32 lk_uid; + __u32 lk_group; + __u32 lk_data; + __u32 lk_flags; +} __attribute__((packed)) lustre_kernelcomm; + +/* Userspace methods */ +extern int libcfs_ukuc_start(lustre_kernelcomm *l, int groups); +extern int libcfs_ukuc_stop(lustre_kernelcomm *l); +extern int libcfs_ukuc_msg_get(lustre_kernelcomm *l, char *buf, int maxsize, + int transport); + +#endif /* __LIBCFS_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h new file mode 100644 index 000000000000..9c40ed904da5 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_prim.h @@ -0,0 +1,101 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_prim.h + * + * General primitives. + * + */ + +#ifndef __LIBCFS_PRIM_H__ +#define __LIBCFS_PRIM_H__ + +#ifndef EXPORT_SYMBOL +# define EXPORT_SYMBOL(s) +#endif + +/* + * Schedule + */ +void cfs_pause(cfs_duration_t ticks); + +/* + * Timer + */ +typedef void (cfs_timer_func_t)(ulong_ptr_t); +void schedule_timeout_and_set_state(cfs_task_state_t, int64_t); + +void init_waitqueue_entry_current(wait_queue_t *link); +int64_t waitq_timedwait(wait_queue_t *, cfs_task_state_t, int64_t); +void waitq_wait(wait_queue_t *, cfs_task_state_t); +void add_wait_queue_exclusive_head(wait_queue_head_t *, wait_queue_t *); + +void cfs_init_timer(timer_list_t *t); +void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg); +void cfs_timer_done(timer_list_t *t); +void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline); +void cfs_timer_disarm(timer_list_t *t); +int cfs_timer_is_armed(timer_list_t *t); +cfs_time_t cfs_timer_deadline(timer_list_t *t); + +/* + * Memory + */ +#ifndef memory_pressure_get +#define memory_pressure_get() (0) +#endif +#ifndef memory_pressure_set +#define memory_pressure_set() do {} while (0) +#endif +#ifndef memory_pressure_clr +#define memory_pressure_clr() do {} while (0) +#endif + +static inline int cfs_memory_pressure_get_and_set(void) +{ + int old = memory_pressure_get(); + + if (!old) + memory_pressure_set(); + return old; +} + +static inline void cfs_memory_pressure_restore(int old) +{ + if (old) + memory_pressure_set(); + else + memory_pressure_clr(); + return; +} +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h new file mode 100644 index 000000000000..62bf32f8539d --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -0,0 +1,568 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_private.h + * + * Various defines for libcfs. + * + */ + +#ifndef __LIBCFS_PRIVATE_H__ +#define __LIBCFS_PRIVATE_H__ + +/* XXX this layering violation is for nidstrings */ +#include + +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif + + + +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) + + +#define LASSERTF(cond, fmt, ...) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ + libcfs_debug_msg(&__msg_data, \ + "ASSERTION( %s ) failed: " fmt, #cond, \ + ## __VA_ARGS__); \ + lbug_with_loc(&__msg_data); \ + } \ +} while (0) + +#define LASSERT(cond) LASSERTF(cond, "\n") + +# define LINVRNT(exp) ((void)sizeof!!(exp)) + +#define KLASSERT(e) LASSERT(e) + +void lbug_with_loc(struct libcfs_debug_msg_data *) __attribute__((noreturn)); + +#define LBUG() \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + lbug_with_loc(&msgdata); \ +} while(0) + +extern atomic_t libcfs_kmemory; +/* + * Memory + */ + +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_dec(ptr, size) \ +do { \ + atomic_sub(size, &libcfs_kmemory); \ +} while (0) + +# define libcfs_kmem_read() \ + atomic_read(&libcfs_kmemory) + + +#ifndef LIBCFS_VMALLOC_SIZE +#define LIBCFS_VMALLOC_SIZE (2 << PAGE_CACHE_SHIFT) /* 2 pages */ +#endif + +#define LIBCFS_ALLOC_PRE(size, mask) \ +do { \ + LASSERT(!in_interrupt() || \ + ((size) <= LIBCFS_VMALLOC_SIZE && \ + ((mask) & GFP_ATOMIC)) != 0); \ +} while (0) + +#define LIBCFS_ALLOC_POST(ptr, size) \ +do { \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ + #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + libcfs_kmem_read()); \ + } else { \ + memset((ptr), 0, (size)); \ + libcfs_kmem_inc((ptr), (size)); \ + CDEBUG(D_MALLOC, "alloc '" #ptr "': %d at %p (tot %d).\n", \ + (int)(size), (ptr), libcfs_kmem_read()); \ + } \ +} while (0) + +/** + * allocate memory with GFP flags @mask + */ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + kmalloc((size), (mask)) : vmalloc(size); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** + * default allocator + */ +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, __GFP_IO) + +/** + * non-sleeping allocator + */ +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, GFP_ATOMIC) + +/** + * allocate memory for specified CPU partition + * \a cptab != NULL, \a cpt is CPU partition id of \a cptab + * \a cptab == NULL, \a cpt is HW NUMA node id + */ +#define LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, mask) \ +do { \ + LIBCFS_ALLOC_PRE((size), (mask)); \ + (ptr) = (size) <= LIBCFS_VMALLOC_SIZE ? \ + cfs_cpt_malloc((cptab), (cpt), (size), (mask)) : \ + cfs_cpt_vmalloc((cptab), (cpt), (size)); \ + LIBCFS_ALLOC_POST((ptr), (size)); \ +} while (0) + +/** default numa allocator */ +#define LIBCFS_CPT_ALLOC(ptr, cptab, cpt, size) \ + LIBCFS_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO) + +#define LIBCFS_FREE(ptr, size) \ +do { \ + int s = (size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), libcfs_kmem_read()); \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ + vfree(ptr); \ + else \ + kfree(ptr); \ +} while (0) + +/******************************************************************************/ + +/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */ +#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) +#define ___htonl(x) __cpu_to_be32(x) +#define ___htons(x) __cpu_to_be16(x) +#define ___ntohl(x) __be32_to_cpu(x) +#define ___ntohs(x) __be16_to_cpu(x) +#define htonl(x) ___htonl(x) +#define ntohl(x) ___ntohl(x) +#define htons(x) ___htons(x) +#define ntohs(x) ___ntohs(x) +#endif + +void libcfs_debug_dumpstack(task_t *tsk); +void libcfs_run_upcall(char **argv); +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(const char *text); + +void libcfs_debug_set_level(unsigned int debug_level); + + +/* + * allocate per-cpu-partition data, returned value is an array of pointers, + * variable can be indexed by CPU ID. + * cptable != NULL: size of array is number of CPU partitions + * cptable == NULL: size of array is number of HW cores + */ +void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); +/* + * destory per-cpu-partition variable + */ +void cfs_percpt_free(void *vars); +int cfs_percpt_number(void *vars); +void *cfs_percpt_current(void *vars); +void *cfs_percpt_index(void *vars, int idx); + +#define cfs_percpt_for_each(var, i, vars) \ + for (i = 0; i < cfs_percpt_number(vars) && \ + ((var) = (vars)[i]) != NULL; i++) + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by count. + */ +void *cfs_array_alloc(int count, unsigned int size); +void cfs_array_free(void *vars); + +#define LASSERT_ATOMIC_ENABLED (1) + +#if LASSERT_ATOMIC_ENABLED + +/** assert value of @a is equal to @v */ +#define LASSERT_ATOMIC_EQ(a, v) \ +do { \ + LASSERTF(atomic_read(a) == v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is unequal to @v */ +#define LASSERT_ATOMIC_NE(a, v) \ +do { \ + LASSERTF(atomic_read(a) != v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little than @v */ +#define LASSERT_ATOMIC_LT(a, v) \ +do { \ + LASSERTF(atomic_read(a) < v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is little/equal to @v */ +#define LASSERT_ATOMIC_LE(a, v) \ +do { \ + LASSERTF(atomic_read(a) <= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v */ +#define LASSERT_ATOMIC_GT(a, v) \ +do { \ + LASSERTF(atomic_read(a) > v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great/equal to @v */ +#define LASSERT_ATOMIC_GE(a, v) \ +do { \ + LASSERTF(atomic_read(a) >= v, \ + "value: %d\n", atomic_read((a))); \ +} while (0) + +/** assert value of @a is great than @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great than @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little than @v2 */ +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \ +} while (0) + +/** assert value of @a is great/equal to @v1 and little/equal to @v2 */ +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \ +do { \ + int __v = atomic_read(a); \ + LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \ +} while (0) + +#else /* !LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_EQ(a, v) do {} while (0) +#define LASSERT_ATOMIC_NE(a, v) do {} while (0) +#define LASSERT_ATOMIC_LT(a, v) do {} while (0) +#define LASSERT_ATOMIC_LE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT(a, v) do {} while (0) +#define LASSERT_ATOMIC_GE(a, v) do {} while (0) +#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0) +#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0) + +#endif /* LASSERT_ATOMIC_ENABLED */ + +#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0) +#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0) + +#define CFS_ALLOC_PTR(ptr) LIBCFS_ALLOC(ptr, sizeof (*(ptr))); +#define CFS_FREE_PTR(ptr) LIBCFS_FREE(ptr, sizeof (*(ptr))); + +/* + * percpu partition lock + * + * There are some use-cases like this in Lustre: + * . each CPU partition has it's own private data which is frequently changed, + * and mostly by the local CPU partition. + * . all CPU partitions share some global data, these data are rarely changed. + * + * LNet is typical example. + * CPU partition lock is designed for this kind of use-cases: + * . each CPU partition has it's own private lock + * . change on private data just needs to take the private lock + * . read on shared data just needs to take _any_ of private locks + * . change on shared data needs to take _all_ private locks, + * which is slow and should be really rare. + */ + +enum { + CFS_PERCPT_LOCK_EX = -1, /* negative */ +}; + + +struct cfs_percpt_lock { + /* cpu-partition-table for this lock */ + struct cfs_cpt_table *pcl_cptab; + /* exclusively locked */ + unsigned int pcl_locked; + /* private lock table */ + spinlock_t **pcl_locks; +}; + +/* return number of private locks */ +static inline int +cfs_percpt_lock_num(struct cfs_percpt_lock *pcl) +{ + return cfs_cpt_number(pcl->pcl_cptab); +} + + +/* + * create a cpu-partition lock based on CPU partition table \a cptab, + * each private lock has extra \a psize bytes padding data + */ +struct cfs_percpt_lock *cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab); +/* destroy a cpu-partition lock */ +void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); + +/* lock private lock \a index of \a pcl */ +void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); +/* unlock private lock \a index of \a pcl */ +void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); +/* create percpt (atomic) refcount based on @cptab */ +atomic_t **cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int val); +/* destroy percpt refcount */ +void cfs_percpt_atomic_free(atomic_t **refs); +/* return sum of all percpu refs */ +int cfs_percpt_atomic_summary(atomic_t **refs); + + +/** Compile-time assertion. + + * Check an invariant described by a constant expression at compile time by + * forcing a compiler error if it does not hold. \a cond must be a constant + * expression as defined by the ISO C Standard: + * + * 6.8.4.2 The switch statement + * .... + * [#3] The expression of each case label shall be an integer + * constant expression and no two of the case constant + * expressions in the same switch statement shall have the same + * value after conversion... + * + */ +#define CLASSERT(cond) do {switch(42) {case (cond): case 0: break;}} while (0) + +/* support decl needed both by kernel and liblustre */ +int libcfs_isknown_lnd(int type); +char *libcfs_lnd2modname(int type); +char *libcfs_lnd2str(int type); +int libcfs_str2lnd(const char *str); +char *libcfs_net2str(__u32 net); +char *libcfs_nid2str(lnet_nid_t nid); +__u32 libcfs_str2net(const char *str); +lnet_nid_t libcfs_str2nid(const char *str); +int libcfs_str2anynid(lnet_nid_t *nid, const char *str); +char *libcfs_id2str(lnet_process_id_t id); +void cfs_free_nidlist(struct list_head *list); +int cfs_parse_nidlist(char *str, int len, struct list_head *list); +int cfs_match_nid(lnet_nid_t nid, struct list_head *list); + +/** \addtogroup lnet_addr + * @{ */ +/* how an LNET NID encodes net:address */ +/** extract the address part of an lnet_nid_t */ +#define LNET_NIDADDR(nid) ((__u32)((nid) & 0xffffffff)) +/** extract the network part of an lnet_nid_t */ +#define LNET_NIDNET(nid) ((__u32)(((nid) >> 32)) & 0xffffffff) +/** make an lnet_nid_t from a network part and an address part */ +#define LNET_MKNID(net,addr) ((((__u64)(net))<<32)|((__u64)(addr))) +/* how net encodes type:number */ +#define LNET_NETNUM(net) ((net) & 0xffff) +#define LNET_NETTYP(net) (((net) >> 16) & 0xffff) +#define LNET_MKNET(typ,num) ((((__u32)(typ))<<16)|((__u32)(num))) +/** @} lnet_addr */ + +/* max value for numeric network address */ +#define MAX_NUMERIC_VALUE 0xffffffff + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) + +#ifndef CFS_CURRENT_TIME +# define CFS_CURRENT_TIME time(0) +#endif + +/* -------------------------------------------------------------------- + * Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. + * All stuff about lwt are put in arch/kp30.h + * -------------------------------------------------------------------- */ + +struct libcfs_device_userstate +{ + int ldu_memhog_pages; + struct page *ldu_memhog_root_page; +}; + +/* what used to be in portals_lib.h */ +#ifndef MIN +# define MIN(a,b) (((a)<(b)) ? (a): (b)) +#endif +#ifndef MAX +# define MAX(a,b) (((a)>(b)) ? (a): (b)) +#endif + +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +static inline int cfs_size_round4 (int val) +{ + return (val + 3) & (~0x3); +} + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +static inline int cfs_size_round16(int val) +{ + return (val + 0xf) & (~0xf); +} + +static inline int cfs_size_round32(int val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline int cfs_size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t cfs_round_strlen(char *fset) +{ + return (size_t)cfs_size_round((int)strlen(fset) + 1); +} + +/* roundup \a val to power2 */ +static inline unsigned int cfs_power2_roundup(unsigned int val) +{ + if (val != LOWEST_BIT_SET(val)) { /* not a power of 2 already */ + do { + val &= ~LOWEST_BIT_SET(val); + } while (val != LOWEST_BIT_SET(val)); + /* ...and round up */ + val <<= 1; + } + return val; +} + +#define LOGL(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGU(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += cfs_size_round(len); \ +} while (0) + +#define LOGL0(var,len,ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char *)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += cfs_size_round(len + 1); \ +} while (0) + +/** + * Lustre Network Driver types. + */ +enum { + /* Only add to these values (i.e. don't ever change or redefine them): + * network addresses depend on them... */ + QSWLND = 1, + SOCKLND = 2, + GMLND = 3, /* obsolete, keep it so that libcfs_nid2str works */ + PTLLND = 4, + O2IBLND = 5, + CIBLND = 6, + OPENIBLND = 7, + IIBLND = 8, + LOLND = 9, + RALND = 10, + VIBLND = 11, + MXLND = 12, + GNILND = 13, +}; + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h new file mode 100644 index 000000000000..a6bac9c36339 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_string.h + * + * Generic string manipulation functions. + * + * Author: Nathan Rutman + */ + +#ifndef __LIBCFS_STRING_H__ +#define __LIBCFS_STRING_H__ + +/* libcfs_string.c */ +/* string comparison ignoring case */ +int cfs_strncasecmp(const char *s1, const char *s2, size_t n); +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask); + +/* Allocate space for and copy an existing string. + * Must free with kfree(). + */ +char *cfs_strdup(const char *str, u_int32_t flags); + +/* safe vsnprintf */ +int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args); + +/* safe snprintf */ +int cfs_snprintf(char *buf, size_t size, const char *fmt, ...); + +/* trim leading and trailing space characters */ +char *cfs_firststr(char *str, size_t size); + +/** + * Structure to represent NULL-less strings. + */ +struct cfs_lstr { + char *ls_str; + int ls_len; +}; + +/* + * Structure to represent \ token of the syntax. + */ +struct cfs_range_expr { + /* + * Link to cfs_expr_list::el_exprs. + */ + struct list_head re_link; + __u32 re_lo; + __u32 re_hi; + __u32 re_stride; +}; + +struct cfs_expr_list { + struct list_head el_link; + struct list_head el_exprs; +}; + +static inline int +cfs_iswhite(char c) +{ + switch (c) { + case ' ': + case '\t': + case '\n': + case '\r': + return 1; + default: + break; + } + return 0; +} + +char *cfs_trimwhite(char *str); +int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); +int cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max); +int cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int single_tok, struct cfs_range_expr **expr); +int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list); +int cfs_expr_list_values(struct cfs_expr_list *expr_list, + int max, __u32 **values); +static inline void +cfs_expr_list_values_free(__u32 *values, int num) +{ + /* This array is allocated by LIBCFS_ALLOC(), so it shouldn't be freed + * by OBD_FREE() if it's called by module other than libcfs & LNet, + * otherwise we will see fake memory leak */ + LIBCFS_FREE(values, num * sizeof(values[0])); +} + +void cfs_expr_list_free(struct cfs_expr_list *expr_list); +void cfs_expr_list_print(struct cfs_expr_list *expr_list); +int cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp); +void cfs_expr_list_free_list(struct list_head *list); +int cfs_ip_addr_parse(char *str, int len, struct list_head *list); +int cfs_ip_addr_match(__u32 addr, struct list_head *list); +void cfs_ip_addr_free(struct list_head *list); + +#define strtoul(str, endp, base) simple_strtoul(str, endp, base) + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h new file mode 100644 index 000000000000..4bdd77163d5e --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_time.h @@ -0,0 +1,132 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_time.h + * + * Time functions. + * + */ + +#ifndef __LIBCFS_TIME_H__ +#define __LIBCFS_TIME_H__ +/* + * generic time manipulation functions. + */ + +static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) +{ + return (cfs_time_t)(t + d); +} + +static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) +{ + return (cfs_time_t)(t1 - t2); +} + +static inline int cfs_time_after(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_before(t2, t1); +} + +static inline int cfs_time_aftereq(cfs_time_t t1, cfs_time_t t2) +{ + return cfs_time_beforeq(t2, t1); +} + + +static inline cfs_time_t cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small, + struct timeval *result) +{ + long r = (long) ( + (large->tv_sec - small->tv_sec) * ONE_MILLION + + (large->tv_usec - small->tv_usec)); + if (result != NULL) { + result->tv_usec = r % ONE_MILLION; + result->tv_sec = r / ONE_MILLION; + } + return r; +} + +static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg) +{ + if (cfs_time_after(cfs_time_current(), + cfs_time_add(now, cfs_time_seconds(15)))) + CERROR("slow %s "CFS_TIME_T" sec\n", msg, + cfs_duration_sec(cfs_time_sub(cfs_time_current(),now))); +} + +#define CFS_RATELIMIT(seconds) \ +({ \ + /* \ + * XXX nikita: non-portable initializer \ + */ \ + static time_t __next_message = 0; \ + int result; \ + \ + if (cfs_time_after(cfs_time_current(), __next_message)) \ + result = 1; \ + else { \ + __next_message = cfs_time_shift(seconds); \ + result = 0; \ + } \ + result; \ +}) + +/* + * helper function similar to do_gettimeofday() of Linux kernel + */ +static inline void cfs_fs_timeval(struct timeval *tv) +{ + cfs_fs_time_t time; + + cfs_fs_time_current(&time); + cfs_fs_time_usec(&time, tv); +} + +/* + * return valid time-out based on user supplied one. Currently we only check + * that time-out is not shorted than allowed. + */ +static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) +{ + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; +} + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h new file mode 100644 index 000000000000..5cc64f327a87 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_workitem.h @@ -0,0 +1,110 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/libcfs_workitem.h + * + * Author: Isaac Huang + * Liang Zhen + * + * A workitems is deferred work with these semantics: + * - a workitem always runs in thread context. + * - a workitem can be concurrent with other workitems but is strictly + * serialized with respect to itself. + * - no CPU affinity, a workitem does not necessarily run on the same CPU + * that schedules it. However, this might change in the future. + * - if a workitem is scheduled again before it has a chance to run, it + * runs only once. + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a + * workitem to reschedule itself. + * + * Usage notes: + * - a workitem can sleep but it should be aware of how that sleep might + * affect others. + * - a workitem runs inside a kernel thread so there's no user space to access. + * - do not use a workitem if the scheduling latency can't be tolerated. + * + * When wi_action returns non-zero, it means the workitem has either been + * freed or reused and workitem scheduler won't touch it any more. + */ + +#ifndef __LIBCFS_WORKITEM_H__ +#define __LIBCFS_WORKITEM_H__ + +struct cfs_wi_sched; + +void cfs_wi_sched_destroy(struct cfs_wi_sched *); +int cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, int cpt, + int nthrs, struct cfs_wi_sched **); + +struct cfs_workitem; + +typedef int (*cfs_wi_action_t) (struct cfs_workitem *); +typedef struct cfs_workitem { + /** chain on runq or rerunq */ + struct list_head wi_list; + /** working function */ + cfs_wi_action_t wi_action; + /** arg for working function */ + void *wi_data; + /** in running */ + unsigned short wi_running:1; + /** scheduled */ + unsigned short wi_scheduled:1; +} cfs_workitem_t; + +static inline void +cfs_wi_init(cfs_workitem_t *wi, void *data, cfs_wi_action_t action) +{ + INIT_LIST_HEAD(&wi->wi_list); + + wi->wi_running = 0; + wi->wi_scheduled = 0; + wi->wi_data = data; + wi->wi_action = action; +} + +void cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +int cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi); +void cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi); + +int cfs_wi_startup(void); +void cfs_wi_shutdown(void); + +/** # workitem scheduler loops before reschedule */ +#define CFS_WI_RESCHED 128 + +#endif /* __LIBCFS_WORKITEM_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h new file mode 100644 index 000000000000..4b7ae1c5bd3b --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/kp30.h @@ -0,0 +1,286 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_KP30_H__ +#define __LIBCFS_LINUX_KP30_H__ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_MM_INLINE +# include +#endif +#include +#include +#include + +#include + + +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_WORK((wq), (void *)(cb)); \ +} while (0) + +#define cfs_get_work_data(type,field,data) container_of(data,type,field) + + +#define our_recalc_sigpending(current) recalc_sigpending() +#define strtok(a,b) strpbrk(a, b) +#define work_struct_t struct work_struct + +#ifdef CONFIG_SMP +#else +#endif + + +#define SEM_COUNT(sem) ((sem)->count) + + +/* ------------------------------------------------------------------- */ + +#define PORTAL_SYMBOL_REGISTER(x) +#define PORTAL_SYMBOL_UNREGISTER(x) + + + + +/******************************************************************************/ +/* Module parameter support */ +#define CFS_MODULE_PARM(name, t, type, perm, desc) \ + module_param(name, type, perm);\ + MODULE_PARM_DESC(name, desc) + +#define CFS_SYSFS_MODULE_PARM 1 /* module parameters accessible via sysfs */ + +/******************************************************************************/ + +#if (__GNUC__) +/* Use the special GNU C __attribute__ hack to have the compiler check the + * printf style argument string against the actual argument count and + * types. + */ +#ifdef printf +# warning printf has been defined as a macro... +# undef printf +#endif + +#endif /* __GNUC__ */ + +# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) +# define printf(format, b...) CDEBUG(D_OTHER, format , ## b) +# define time(a) CURRENT_TIME + +# define cfs_num_present_cpus() num_present_cpus() + +/******************************************************************************/ +/* Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. */ +#define LWT_SUPPORT 0 + +#define LWT_MEMORY (16<<20) + +#ifndef KLWT_SUPPORT +# if !defined(BITS_PER_LONG) +# error "BITS_PER_LONG not defined" +# endif + +/* kernel hasn't defined this? */ +typedef struct { + long long lwte_when; + char *lwte_where; + void *lwte_task; + long lwte_p1; + long lwte_p2; + long lwte_p3; + long lwte_p4; +# if BITS_PER_LONG > 32 + long lwte_pad; +# endif +} lwt_event_t; +#endif /* !KLWT_SUPPORT */ + +#if LWT_SUPPORT +# if !KLWT_SUPPORT + +typedef struct _lwt_page { + struct list_head lwtp_list; + struct page *lwtp_page; + lwt_event_t *lwtp_events; +} lwt_page_t; + +typedef struct { + int lwtc_current_index; + lwt_page_t *lwtc_current_page; +} lwt_cpu_t; + +extern int lwt_enabled; +extern lwt_cpu_t lwt_cpus[]; + +/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. + * This stuff is meant for finding specific problems; it never stays in + * production code... */ + +#define LWTSTR(n) #n +#define LWTWHERE(f,l) f ":" LWTSTR(l) +#define LWT_EVENTS_PER_PAGE (PAGE_CACHE_SIZE / sizeof (lwt_event_t)) + +#define LWT_EVENT(p1, p2, p3, p4) \ +do { \ + unsigned long flags; \ + lwt_cpu_t *cpu; \ + lwt_page_t *p; \ + lwt_event_t *e; \ + \ + if (lwt_enabled) { \ + local_irq_save (flags); \ + \ + cpu = &lwt_cpus[smp_processor_id()]; \ + p = cpu->lwtc_current_page; \ + e = &p->lwtp_events[cpu->lwtc_current_index++]; \ + \ + if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \ + cpu->lwtc_current_page = \ + list_entry (p->lwtp_list.next, \ + lwt_page_t, lwtp_list); \ + cpu->lwtc_current_index = 0; \ + } \ + \ + e->lwte_when = get_cycles(); \ + e->lwte_where = LWTWHERE(__FILE__,__LINE__); \ + e->lwte_task = current; \ + e->lwte_p1 = (long)(p1); \ + e->lwte_p2 = (long)(p2); \ + e->lwte_p3 = (long)(p3); \ + e->lwte_p4 = (long)(p4); \ + \ + local_irq_restore (flags); \ + } \ +} while (0) + +#endif /* !KLWT_SUPPORT */ + +extern int lwt_init (void); +extern void lwt_fini (void); +extern int lwt_lookup_string (int *size, char *knlptr, + char *usrptr, int usrsize); +extern int lwt_control (int enable, int clear); +extern int lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size, + void *user_ptr, int user_size); +#endif /* LWT_SUPPORT */ + +/* ------------------------------------------------------------------ */ + +#define IOCTL_LIBCFS_TYPE long + +#ifdef __CYGWIN__ +# ifndef BITS_PER_LONG +# define BITS_PER_LONG 64 +# endif +#endif + +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) + +/* this is a bit chunky */ + +#define _LWORDSIZE BITS_PER_LONG + +# define LPU64 "%llu" +# define LPD64 "%lld" +# define LPX64 "%#llx" +# define LPX64i "%llx" +# define LPO64 "%#llo" +# define LPF64 "L" + +/* + * long_ptr_t & ulong_ptr_t, same to "long" for gcc + */ +# define LPLU "%lu" +# define LPLD "%ld" +# define LPLX "%#lx" + +/* + * pid_t + */ +# define LPPID "%d" + + +#undef _LWORDSIZE + +/* compat macroses */ + + +#ifndef get_cpu +# ifdef CONFIG_PREEMPT +# define get_cpu() ({ preempt_disable(); smp_processor_id(); }) +# define put_cpu() preempt_enable() +# else +# define get_cpu() smp_processor_id() +# define put_cpu() +# endif +#else +#endif /* get_cpu & put_cpu */ + +#define INIT_CTL_NAME(a) +#define INIT_STRATEGY(a) + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h new file mode 100644 index 000000000000..757e6dcaaf9a --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/libcfs.h @@ -0,0 +1,131 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_LIBCFS_H__ +#define __LIBCFS_LINUX_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* THREAD_SIZE */ +#include + +#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) + +#if !defined(__x86_64__) +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +# endif /* __ia64__ */ + +#define __CHECK_STACK(msgdata, mask, cdls) \ +do { \ + if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ + LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ + libcfs_stack = CDEBUG_STACK(); \ + libcfs_debug_msg(msgdata, \ + "maximum lustre stack %lu\n", \ + CDEBUG_STACK()); \ + (msgdata)->msg_mask = mask; \ + (msgdata)->msg_cdls = cdls; \ + dump_stack(); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) +#else /* __x86_64__ */ +#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while(0) +#define CDEBUG_STACK() (0L) +#endif /* __x86_64__ */ + +/* initial pid */ +#define LUSTRE_LNET_PID 12345 + +#define ENTRY_NESTING_SUPPORT (1) +#define ENTRY_NESTING do {;} while (0) +#define EXIT_NESTING do {;} while (0) +#define __current_nesting_level() (0) + +/** + * Platform specific declarations for cfs_curproc API (libcfs/curproc.h) + * + * Implementation is in linux-curproc.c + */ +#define CFS_CURPROC_COMM_MAX (sizeof ((struct task_struct *)0)->comm) + +#include + +/* + * No stack-back-tracing in Linux for now. + */ +struct cfs_stack_trace { +}; + +/* long integer with size equal to pointer */ +typedef unsigned long ulong_ptr_t; +typedef long long_ptr_t; + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + + + + +#endif /* _LINUX_LIBCFS_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h new file mode 100644 index 000000000000..43936e349dd4 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-bitops.h @@ -0,0 +1,38 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-bitops.h + */ +#include diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h new file mode 100644 index 000000000000..224371c92f7c --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-cpu.h @@ -0,0 +1,175 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + * + * Author: liang@whamcloud.com + */ + +#ifndef __LIBCFS_LINUX_CPU_H__ +#define __LIBCFS_LINUX_CPU_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include + + +#ifdef CONFIG_SMP + +#define HAVE_LIBCFS_CPT + +/** virtual processing unit */ +struct cfs_cpu_partition { + /* CPUs mask for this partition */ + cpumask_t *cpt_cpumask; + /* nodes mask for this partition */ + nodemask_t *cpt_nodemask; + /* spread rotor for NUMA allocator */ + unsigned cpt_spread_rotor; +}; + +/** descriptor for CPU partitions */ +struct cfs_cpt_table { + /* version, reserved for hotplug */ + unsigned ctb_version; + /* spread rotor for NUMA allocator */ + unsigned ctb_spread_rotor; + /* # of CPU partitions */ + unsigned ctb_nparts; + /* partitions tables */ + struct cfs_cpu_partition *ctb_parts; + /* shadow HW CPU to CPU partition ID */ + int *ctb_cpu2cpt; + /* all cpus in this partition table */ + cpumask_t *ctb_cpumask; + /* all nodes in this partition table */ + nodemask_t *ctb_nodemask; +}; + +void cfs_cpu_core_siblings(int cpu, cpumask_t *mask); +void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask); +void cfs_node_to_cpumask(int node, cpumask_t *mask); +int cfs_cpu_core_nsiblings(int cpu); +int cfs_cpu_ht_nsiblings(int cpu); + +/** + * comment out definitions for compatible layer + * #define CFS_CPU_NR NR_CPUS + * + * typedef cpumask_t cfs_cpumask_t; + * + * #define cfs_cpu_current() smp_processor_id() + * #define cfs_cpu_online(i) cpu_online(i) + * #define cfs_cpu_online_num() num_online_cpus() + * #define cfs_cpu_online_for_each(i) for_each_online_cpu(i) + * #define cfs_cpu_possible_num() num_possible_cpus() + * #define cfs_cpu_possible_for_each(i) for_each_possible_cpu(i) + * + * #ifdef CONFIG_CPUMASK_SIZE + * #define cfs_cpu_mask_size() cpumask_size() + * #else + * #define cfs_cpu_mask_size() sizeof(cfs_cpumask_t) + * #endif + * + * #define cfs_cpu_mask_set(i, mask) cpu_set(i, mask) + * #define cfs_cpu_mask_unset(i, mask) cpu_clear(i, mask) + * #define cfs_cpu_mask_isset(i, mask) cpu_isset(i, mask) + * #define cfs_cpu_mask_clear(mask) cpus_clear(mask) + * #define cfs_cpu_mask_empty(mask) cpus_empty(mask) + * #define cfs_cpu_mask_weight(mask) cpus_weight(mask) + * #define cfs_cpu_mask_first(mask) first_cpu(mask) + * #define cfs_cpu_mask_any_online(mask) (any_online_cpu(mask) != NR_CPUS) + * #define cfs_cpu_mask_for_each(i, mask) for_each_cpu_mask(i, mask) + * #define cfs_cpu_mask_bind(t, mask) set_cpus_allowed(t, mask) + * + * #ifdef HAVE_CPUMASK_COPY + * #define cfs_cpu_mask_copy(dst, src) cpumask_copy(dst, src) + * #else + * #define cfs_cpu_mask_copy(dst, src) memcpy(dst, src, sizeof(*src)) + * #endif + * + * static inline void + * cfs_cpu_mask_of_online(cfs_cpumask_t *mask) + * { + * cfs_cpu_mask_copy(mask, &cpu_online_map); + * } + * + * #ifdef CONFIG_NUMA + * + * #define CFS_NODE_NR MAX_NUMNODES + * + * typedef nodemask_t cfs_node_mask_t; + * + * #define cfs_node_of_cpu(cpu) cpu_to_node(cpu) + * #define cfs_node_online(i) node_online(i) + * #define cfs_node_online_num() num_online_nodes() + * #define cfs_node_online_for_each(i) for_each_online_node(i) + * #define cfs_node_possible_num() num_possible_nodes() + * #define cfs_node_possible_for_each(i) for_each_node(i) + * + * static inline void cfs_node_to_cpumask(int node, cfs_cpumask_t *mask) + * { + * #if defined(HAVE_NODE_TO_CPUMASK) + * *mask = node_to_cpumask(node); + * #elif defined(HAVE_CPUMASK_OF_NODE) + * cfs_cpu_mask_copy(mask, cpumask_of_node(node)); + * #else + * # error "Needs node_to_cpumask or cpumask_of_node" + * #endif + * } + * + * #define cfs_node_mask_set(i, mask) node_set(i, mask) + * #define cfs_node_mask_unset(i, mask) node_clear(i, mask) + * #define cfs_node_mask_isset(i, mask) node_isset(i, mask) + * #define cfs_node_mask_clear(mask) nodes_reset(mask) + * #define cfs_node_mask_empty(mask) nodes_empty(mask) + * #define cfs_node_mask_weight(mask) nodes_weight(mask) + * #define cfs_node_mask_for_each(i, mask) for_each_node_mask(i, mask) + * #define cfs_node_mask_copy(dst, src) memcpy(dst, src, sizeof(*src)) + * + * static inline void + * cfs_node_mask_of_online(cfs_node_mask_t *mask) + * { + * cfs_node_mask_copy(mask, &node_online_map); + * } + * + * #endif + */ + +#endif /* CONFIG_SMP */ +#endif /* __LIBCFS_LINUX_CPU_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h new file mode 100644 index 000000000000..97c771cf691f --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-crypto.h @@ -0,0 +1,49 @@ + /* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/** + * Linux crypto hash specific functions. + */ + +/** + * Functions for start/stop shash CRC32 algorithm. + */ +int cfs_crypto_crc32_register(void); +void cfs_crypto_crc32_unregister(void); + +/** + * Functions for start/stop shash adler32 algorithm. + */ +int cfs_crypto_adler32_register(void); +void cfs_crypto_adler32_unregister(void); + +/** + * Functions for start/stop shash crc32 pclmulqdq + */ +int cfs_crypto_crc32_pclmul_register(void); +void cfs_crypto_crc32_pclmul_unregister(void); diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h new file mode 100644 index 000000000000..90ff47a18924 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-fs.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-fs.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_FS_H__ +#define __LIBCFS_LINUX_CFS_FS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include +#include + +#define filp_size(f) \ + (i_size_read((f)->f_dentry->d_inode)) +#define filp_poff(f) \ + (&(f)->f_pos) + +# define do_fsync(fp, flag) \ + ((fp)->f_op->fsync(fp, 0, LLONG_MAX, flag)) + +#define filp_read(fp, buf, size, pos) \ + ((fp)->f_op->read((fp), (buf), (size), pos)) + +#define filp_write(fp, buf, size, pos) \ + ((fp)->f_op->write((fp), (buf), (size), pos)) + +#define filp_fsync(fp) \ + do_fsync(fp, 1) + +#define flock_type(fl) ((fl)->fl_type) +#define flock_set_type(fl, type) do { (fl)->fl_type = (type); } while (0) +#define flock_pid(fl) ((fl)->fl_pid) +#define flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while (0) +#define flock_start(fl) ((fl)->fl_start) +#define flock_set_start(fl, st) do { (fl)->fl_start = (st); } while (0) +#define flock_end(fl) ((fl)->fl_end) +#define flock_set_end(fl, end) do { (fl)->fl_end = (end); } while (0) + +ssize_t filp_user_write(struct file *filp, const void *buf, size_t count, + loff_t *offset); + +#ifndef IFSHIFT +#define IFSHIFT 12 +#endif + +#ifndef IFTODT +#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT) +#endif +#ifndef DTTOIF +#define DTTOIF(dirtype) ((dirtype) << IFSHIFT) +#endif + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h new file mode 100644 index 000000000000..6fbcbf3ab0d3 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-lock.h @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-lock.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_LOCK_H__ +#define __LIBCFS_LINUX_CFS_LOCK_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include + +/* + * IMPORTANT !!!!!!!! + * + * All locks' declaration are not guaranteed to be initialized, + * Althought some of they are initialized in Linux. All locks + * declared by CFS_DECL_* should be initialized explicitly. + */ + +/* + * spin_lock "implementation" (use Linux kernel's primitives) + * + * - spin_lock_init(x) + * - spin_lock(x) + * - spin_lock_bh(x) + * - spin_lock_bh_init(x) + * - spin_unlock(x) + * - spin_unlock_bh(x) + * - spin_trylock(x) + * - spin_is_locked(x) + * + * - spin_lock_irq(x) + * - spin_lock_irqsave(x, f) + * - spin_unlock_irqrestore(x, f) + * - read_lock_irqsave(lock, f) + * - write_lock_irqsave(lock, f) + * - write_unlock_irqrestore(lock, f) + */ + +/* + * spinlock "implementation" + */ + + + + +/* + * rw_semaphore "implementation" (use Linux kernel's primitives) + * + * - sema_init(x) + * - init_rwsem(x) + * - down_read(x) + * - up_read(x) + * - down_write(x) + * - up_write(x) + */ + + +#define fini_rwsem(s) do {} while (0) + + +/* + * rwlock_t "implementation" (use Linux kernel's primitives) + * + * - rwlock_init(x) + * - read_lock(x) + * - read_unlock(x) + * - write_lock(x) + * - write_unlock(x) + * - write_lock_bh(x) + * - write_unlock_bh(x) + * + * - RW_LOCK_UNLOCKED + */ + + +#ifndef DEFINE_RWLOCK +#define DEFINE_RWLOCK(lock) rwlock_t lock = __RW_LOCK_UNLOCKED(lock) +#endif + +/* + * completion "implementation" (use Linux kernel's primitives) + * + * - DECLARE_COMPLETION(work) + * - INIT_COMPLETION(c) + * - COMPLETION_INITIALIZER(work) + * - init_completion(c) + * - complete(c) + * - wait_for_completion(c) + * - wait_for_completion_interruptible(c) + * - fini_completion(c) + */ +#define fini_completion(c) do { } while (0) + +/* + * semaphore "implementation" (use Linux kernel's primitives) + * - DEFINE_SEMAPHORE(name) + * - sema_init(sem, val) + * - up(sem) + * - down(sem) + * - down_interruptible(sem) + * - down_trylock(sem) + */ + +/* + * mutex "implementation" (use Linux kernel's primitives) + * + * - DEFINE_MUTEX(name) + * - mutex_init(x) + * - mutex_lock(x) + * - mutex_unlock(x) + * - mutex_trylock(x) + * - mutex_is_locked(x) + * - mutex_destroy(x) + */ + +#ifndef lockdep_set_class + +/************************************************************************** + * + * Lockdep "implementation". Also see liblustre.h + * + **************************************************************************/ + +struct lock_class_key { + ; +}; + +#define lockdep_set_class(lock, key) \ + do { (void)sizeof(lock); (void)sizeof(key); } while (0) +/* This has to be a macro, so that `subclass' can be undefined in kernels + * that do not support lockdep. */ + + +static inline void lockdep_off(void) +{ +} + +static inline void lockdep_on(void) +{ +} +#else + +#endif /* lockdep_set_class */ + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#ifndef mutex_lock_nested +#define mutex_lock_nested(mutex, subclass) mutex_lock(mutex) +#endif + +#ifndef spin_lock_nested +#define spin_lock_nested(lock, subclass) spin_lock(lock) +#endif + +#ifndef down_read_nested +#define down_read_nested(lock, subclass) down_read(lock) +#endif + +#ifndef down_write_nested +#define down_write_nested(lock, subclass) down_write(lock) +#endif +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + + +#endif /* __LIBCFS_LINUX_CFS_LOCK_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h new file mode 100644 index 000000000000..f6cb4635ef4e --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-mem.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_MEM_H__ +#define __LIBCFS_LINUX_CFS_MEM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include +#include +#include + +#define CFS_PAGE_MASK (~((__u64)PAGE_CACHE_SIZE-1)) +#define page_index(p) ((p)->index) + +#define memory_pressure_get() (current->flags & PF_MEMALLOC) +#define memory_pressure_set() do { current->flags |= PF_MEMALLOC; } while (0) +#define memory_pressure_clr() do { current->flags &= ~PF_MEMALLOC; } while (0) + +#if BITS_PER_LONG == 32 +/* limit to lowmem on 32-bit systems */ +#define NUM_CACHEPAGES \ + min(num_physpages, 1UL << (30 - PAGE_CACHE_SHIFT) * 3 / 4) +#else +#define NUM_CACHEPAGES num_physpages +#endif + +/* + * In Linux there is no way to determine whether current execution context is + * blockable. + */ +#define ALLOC_ATOMIC_TRY GFP_ATOMIC + +#define DECL_MMSPACE mm_segment_t __oldfs +#define MMSPACE_OPEN \ + do { __oldfs = get_fs(); set_fs(get_ds());} while(0) +#define MMSPACE_CLOSE set_fs(__oldfs) + + +/* + * NUMA allocators + * + * NB: we will rename these functions in a separate patch: + * - rename kmalloc to cfs_malloc + * - rename kmalloc/free_page to cfs_page_alloc/free + * - rename kmalloc/free_large to cfs_vmalloc/vfree + */ +extern void *cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, + size_t nr_bytes, unsigned int flags); +extern void *cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt, + size_t nr_bytes); +extern struct page *cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, + int cpt, unsigned int flags); +extern void *cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, + struct cfs_cpt_table *cptab, + int cpt, unsigned int flags); + +/* + * Shrinker + */ + +# define SHRINKER_ARGS(sc, nr_to_scan, gfp_mask) \ + struct shrinker *shrinker, \ + struct shrink_control *sc +# define shrink_param(sc, var) ((sc)->var) + +typedef int (*shrinker_t)(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)); + +static inline +struct shrinker *set_shrinker(int seek, shrinker_t func) +{ + struct shrinker *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return (NULL); + + s->shrink = func; + s->seeks = seek; + + register_shrinker(s); + + return s; +} + +static inline +void remove_shrinker(struct shrinker *shrinker) +{ + if (shrinker == NULL) + return; + + unregister_shrinker(shrinker); + kfree(shrinker); +} + +#endif /* __LINUX_CFS_MEM_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h new file mode 100644 index 000000000000..c346bcdf05bb --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-prim.h @@ -0,0 +1,243 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-prim.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_PRIM_H__ +#define __LIBCFS_LINUX_CFS_PRIM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +/* + * CPU + */ +#ifdef for_each_possible_cpu +#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) +#elif defined(for_each_cpu) +#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu) +#endif + +#ifdef NR_CPUS +#else +#define NR_CPUS 1 +#endif + +#define cfs_set_cpus_allowed(t, mask) set_cpus_allowed(t, mask) + +/* + * cache + */ + +/* + * IRQs + */ + + +/* + * Pseudo device register + */ +typedef struct miscdevice psdev_t; + +/* + * Sysctl register + */ +typedef struct ctl_table ctl_table_t; +typedef struct ctl_table_header ctl_table_header_t; + +#define cfs_register_sysctl_table(t, a) register_sysctl_table(t) + +#define DECLARE_PROC_HANDLER(name) \ +static int \ +LL_PROC_PROTO(name) \ +{ \ + DECLARE_LL_PROC_PPOS_DECL; \ + \ + return proc_call_handler(table->data, write, \ + ppos, buffer, lenp, \ + __##name); \ +} + +/* + * Symbol register + */ +#define cfs_symbol_register(s, p) do {} while(0) +#define cfs_symbol_unregister(s) do {} while(0) +#define cfs_symbol_get(s) symbol_get(s) +#define cfs_symbol_put(s) symbol_put(s) + +typedef struct module module_t; + +/* + * Proc file system APIs + */ +typedef struct proc_dir_entry proc_dir_entry_t; + +/* + * Wait Queue + */ + + +typedef long cfs_task_state_t; + +#define CFS_DECL_WAITQ(wq) DECLARE_WAIT_QUEUE_HEAD(wq) + +/* + * Task struct + */ +typedef struct task_struct task_t; +#define DECL_JOURNAL_DATA void *journal_info +#define PUSH_JOURNAL do { \ + journal_info = current->journal_info; \ + current->journal_info = NULL; \ + } while(0) +#define POP_JOURNAL do { \ + current->journal_info = journal_info; \ + } while(0) + +/* Module interfaces */ +#define cfs_module(name, version, init, fini) \ + module_init(init); \ + module_exit(fini) + +/* + * Signal + */ + +/* + * Timer + */ +typedef struct timer_list timer_list_t; + + +#ifndef wait_event_timeout /* Only for RHEL3 2.4.21 kernel */ +#define __wait_event_timeout(wq, condition, timeout, ret) \ +do { \ + int __ret = 0; \ + if (!(condition)) { \ + wait_queue_t __wait; \ + unsigned long expire; \ + \ + init_waitqueue_entry(&__wait, current); \ + expire = timeout + jiffies; \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + if (jiffies > expire) { \ + ret = jiffies - expire; \ + break; \ + } \ + schedule_timeout(timeout); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ + } \ +} while (0) +/* + retval == 0; condition met; we're good. + retval > 0; timed out. +*/ +#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret) \ +do { \ + ret = 0; \ + if (!(condition)) \ + __wait_event_timeout(wq, condition, timeout, ret); \ +} while (0) +#else +#define cfs_waitq_wait_event_timeout(wq, condition, timeout, ret) \ + ret = wait_event_timeout(wq, condition, timeout) +#endif + +#define cfs_waitq_wait_event_interruptible_timeout(wq, c, timeout, ret) \ + ret = wait_event_interruptible_timeout(wq, c, timeout) + +/* + * atomic + */ + + +#define cfs_atomic_add_unless(atom, a, u) atomic_add_unless(atom, a, u) +#define cfs_atomic_cmpxchg(atom, old, nv) atomic_cmpxchg(atom, old, nv) + +/* + * membar + */ + + +/* + * interrupt + */ + + +/* + * might_sleep + */ + +/* + * group_info + */ +typedef struct group_info group_info_t; + + +/* + * Random bytes + */ +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h new file mode 100644 index 000000000000..687f33f4e8a7 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-tcpip.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-tcpip.h + * + * Basic library routines. + */ + +#ifndef __LIBCFS_LINUX_CFS_TCP_H__ +#define __LIBCFS_LINUX_CFS_TCP_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#include + +#ifndef HIPQUAD +// XXX Should just kill all users +#if defined(__LITTLE_ENDIAN) +#define HIPQUAD(addr) \ + ((unsigned char *)&addr)[3], \ + ((unsigned char *)&addr)[2], \ + ((unsigned char *)&addr)[1], \ + ((unsigned char *)&addr)[0] +#elif defined(__BIG_ENDIAN) +#define HIPQUAD NIPQUAD +#else +#error "Please fix asm/byteorder.h" +#endif /* __LITTLE_ENDIAN */ +#endif + +typedef struct socket socket_t; + +#define SOCK_SNDBUF(so) ((so)->sk->sk_sndbuf) +#define SOCK_TEST_NOSPACE(so) test_bit(SOCK_NOSPACE, &(so)->flags) + +static inline int +cfs_sock_error(struct socket *sock) +{ + return sock->sk->sk_err; +} + +static inline int +cfs_sock_wmem_queued(struct socket *sock) +{ + return sock->sk->sk_wmem_queued; +} + +#define cfs_sk_sleep(sk) sk_sleep(sk) + +#define DEFAULT_NET (&init_net) + +#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h new file mode 100644 index 000000000000..4a48b914b42a --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-time.h @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/linux/linux-time.h + * + * Implementation of portable time API for Linux (kernel and user-level). + * + * Author: Nikita Danilov + */ + +#ifndef __LIBCFS_LINUX_LINUX_TIME_H__ +#define __LIBCFS_LINUX_LINUX_TIME_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +/* Portable time API */ + +/* + * Platform provides three opaque data-types: + * + * cfs_time_t represents point in time. This is internal kernel + * time rather than "wall clock". This time bears no + * relation to gettimeofday(). + * + * cfs_duration_t represents time interval with resolution of internal + * platform clock + * + * cfs_fs_time_t represents instance in world-visible time. This is + * used in file-system time-stamps + * + * cfs_time_t cfs_time_current(void); + * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); + * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before (cfs_time_t, cfs_time_t); + * int cfs_impl_time_before_eq(cfs_time_t, cfs_time_t); + * + * cfs_duration_t cfs_duration_build(int64_t); + * + * time_t cfs_duration_sec (cfs_duration_t); + * void cfs_duration_usec(cfs_duration_t, struct timeval *); + * void cfs_duration_nsec(cfs_duration_t, struct timespec *); + * + * void cfs_fs_time_current(cfs_fs_time_t *); + * time_t cfs_fs_time_sec (cfs_fs_time_t *); + * void cfs_fs_time_usec (cfs_fs_time_t *, struct timeval *); + * void cfs_fs_time_nsec (cfs_fs_time_t *, struct timespec *); + * int cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *); + * int cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *); + * + * CFS_TIME_FORMAT + * CFS_DURATION_FORMAT + * + */ + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + + +#include +#include +#include +#include +#include + +#include + +/* + * post 2.5 kernels. + */ + +#include + +typedef struct timespec cfs_fs_time_t; + +static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v) +{ + v->tv_sec = t->tv_sec; + v->tv_usec = t->tv_nsec / 1000; +} + +static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s) +{ + *s = *t; +} + +/* + * internal helper function used by cfs_fs_time_before*() + */ +static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t) +{ + return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec; +} + + +/* + * Generic kernel stuff + */ + +typedef unsigned long cfs_time_t; /* jiffies */ +typedef long cfs_duration_t; +typedef cycles_t cfs_cycles_t; + +static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) +{ + return time_before(t1, t2); +} + +static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) +{ + return time_before_eq(t1, t2); +} + +static inline cfs_time_t cfs_time_current(void) +{ + return jiffies; +} + +static inline time_t cfs_time_current_sec(void) +{ + return get_seconds(); +} + +static inline void cfs_fs_time_current(cfs_fs_time_t *t) +{ + *t = CURRENT_TIME; +} + +static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t) +{ + return t->tv_sec; +} + +static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2) +{ + return __cfs_fs_time_flat(t1) < __cfs_fs_time_flat(t2); +} + +static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2) +{ + return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2); +} + +#if 0 +static inline cfs_duration_t cfs_duration_build(int64_t nano) +{ +#if (BITS_PER_LONG == 32) + /* We cannot use do_div(t, ONE_BILLION), do_div can only process + * 64 bits n and 32 bits base */ + int64_t t = nano * HZ; + do_div(t, 1000); + do_div(t, 1000000); + return (cfs_duration_t)t; +#else + return (nano * HZ / ONE_BILLION); +#endif +} +#endif + +static inline cfs_duration_t cfs_time_seconds(int seconds) +{ + return ((cfs_duration_t)seconds) * HZ; +} + +static inline time_t cfs_duration_sec(cfs_duration_t d) +{ + return d / HZ; +} + +static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s) +{ +#if (BITS_PER_LONG == 32) && (HZ > 4096) + __u64 t; + + s->tv_sec = d / HZ; + t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION; + do_div(t, HZ); + s->tv_usec = t; +#else + s->tv_sec = d / HZ; + s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * \ + ONE_MILLION) / HZ; +#endif +} + +static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s) +{ +#if (BITS_PER_LONG == 32) + __u64 t; + + s->tv_sec = d / HZ; + t = (d - s->tv_sec * HZ) * ONE_BILLION; + do_div(t, HZ); + s->tv_nsec = t; +#else + s->tv_sec = d / HZ; + s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ; +#endif +} + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); +} + +static inline int cfs_time_before_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 > 0; +} + +static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) +{ + return (__s64)t2 - (__s64)t1 >= 0; +} + + +/* + * One jiffy + */ +#define CFS_TICK (1) + +#define CFS_TIME_T "%lu" +#define CFS_DURATION_T "%ld" + + +#endif /* __LIBCFS_LINUX_LINUX_TIME_H__ */ +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h new file mode 100644 index 000000000000..142394925567 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-types.h @@ -0,0 +1,36 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/include/libcfs/user-bitops.h + */ +#include diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h new file mode 100644 index 000000000000..2b9487267596 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/linux/portals_compat25.h @@ -0,0 +1,116 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_PORTALS_COMPAT_H__ +#define __LIBCFS_LINUX_PORTALS_COMPAT_H__ + +// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved +#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG +# define SIGNAL_MASK_ASSERT() \ + LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC) +#else +# define SIGNAL_MASK_ASSERT() +#endif +// XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved + +#define SIGNAL_MASK_LOCK(task, flags) \ + spin_lock_irqsave(&task->sighand->siglock, flags) +#define SIGNAL_MASK_UNLOCK(task, flags) \ + spin_unlock_irqrestore(&task->sighand->siglock, flags) +#define USERMODEHELPER(path, argv, envp) \ + call_usermodehelper(path, argv, envp, 1) +#define clear_tsk_thread_flag(current, TIF_SIGPENDING) clear_tsk_thread_flag(current, \ + TIF_SIGPENDING) +# define smp_num_cpus num_online_cpus() + +#define cfs_wait_event_interruptible(wq, condition, ret) \ + ret = wait_event_interruptible(wq, condition) +#define cfs_wait_event_interruptible_exclusive(wq, condition, ret) \ + ret = wait_event_interruptible_exclusive(wq, condition) + +#define THREAD_NAME(comm, len, fmt, a...) \ + snprintf(comm, len, fmt, ## a) + +/* 2.6 alloc_page users can use page->lru */ +#define PAGE_LIST_ENTRY lru +#define PAGE_LIST(page) ((page)->lru) + +#ifndef __user +#define __user +#endif + +#ifndef __fls +#define __cfs_fls fls +#else +#define __cfs_fls __fls +#endif + +#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos) \ + proc_dointvec(table, write, buffer, lenp, ppos); + +#define ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos) \ + proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos) \ + proc_dostring(table, write, buffer, lenp, ppos); +#define LL_PROC_PROTO(name) \ + name(ctl_table_t *table, int write, \ + void __user *buffer, size_t *lenp, loff_t *ppos) +#define DECLARE_LL_PROC_PPOS_DECL + +/* helper for sysctl handlers */ +int proc_call_handler(void *data, int write, + loff_t *ppos, void *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void *buffer, int len)); +/* + * CPU + */ +#ifdef for_each_possible_cpu +#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) +#elif defined(for_each_cpu) +#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu) +#endif + +#ifdef NR_CPUS +#else +#define NR_CPUS 1 +#endif + +#define cfs_set_cpus_allowed(t, mask) set_cpus_allowed(t, mask) + +#define cfs_register_sysctl_table(t, a) register_sysctl_table(t) + +#endif /* _PORTALS_COMPAT_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/lucache.h b/drivers/staging/lustre/include/linux/libcfs/lucache.h new file mode 100644 index 000000000000..7ae36fc88d77 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/lucache.h @@ -0,0 +1,162 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUCACHE_H +#define _LUCACHE_H + +#include + +/** \defgroup ucache ucache + * + * @{ + */ + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) (i)->ue_flags |= UC_CACHE_NEW +#define UC_CACHE_SET_INVALID(i) (i)->ue_flags |= UC_CACHE_INVALID +#define UC_CACHE_SET_ACQUIRING(i) (i)->ue_flags |= UC_CACHE_ACQUIRING +#define UC_CACHE_SET_EXPIRED(i) (i)->ue_flags |= UC_CACHE_EXPIRED +#define UC_CACHE_SET_VALID(i) (i)->ue_flags = 0 + +#define UC_CACHE_CLEAR_NEW(i) (i)->ue_flags &= ~UC_CACHE_NEW +#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING +#define UC_CACHE_CLEAR_INVALID(i) (i)->ue_flags &= ~UC_CACHE_INVALID +#define UC_CACHE_CLEAR_EXPIRED(i) (i)->ue_flags &= ~UC_CACHE_EXPIRED + +struct upcall_cache_entry; + +struct md_perm { + lnet_nid_t mp_nid; + __u32 mp_perm; +}; + +struct md_identity { + struct upcall_cache_entry *mi_uc_entry; + uid_t mi_uid; + gid_t mi_gid; + group_info_t *mi_ginfo; + int mi_nperms; + struct md_perm *mi_perms; +}; + +struct upcall_cache_entry { + struct list_head ue_hash; + __u64 ue_key; + atomic_t ue_refcount; + int ue_flags; + wait_queue_head_t ue_waitq; + cfs_time_t ue_acquire_expire; + cfs_time_t ue_expire; + union { + struct md_identity identity; + } u; +}; + +#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) + +struct upcall_cache; + +struct upcall_cache_ops { + void (*init_entry)(struct upcall_cache_entry *, void *args); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*upcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*downcall_compare)(struct upcall_cache *, + struct upcall_cache_entry *, + __u64 key, void *args); + int (*do_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, void *); +}; + +struct upcall_cache { + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; + rwlock_t uc_upcall_rwlock; + + char uc_name[40]; /* for upcall */ + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + int uc_acquire_expire; /* seconds */ + int uc_entry_expire; /* seconds */ + struct upcall_cache_ops *uc_ops; +}; + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args); +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args); +void upcall_cache_flush_idle(struct upcall_cache *cache); +void upcall_cache_flush_all(struct upcall_cache *cache); +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args); +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops); +void upcall_cache_cleanup(struct upcall_cache *cache); + +#if 0 +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash, + __u64 key, __u32 primary, + __u32 ngroups, __u32 *groups); +void upcall_cache_put_entry(struct upcall_cache *hash, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key, + __u32 primary, __u32 ngroups, __u32 *groups); +void upcall_cache_flush_idle(struct upcall_cache *cache); +void upcall_cache_flush_all(struct upcall_cache *cache); +struct upcall_cache *upcall_cache_init(const char *name); +void upcall_cache_cleanup(struct upcall_cache *hash); + +#endif + +/** @} ucache */ + +#endif /* _LUCACHE_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/params_tree.h b/drivers/staging/lustre/include/linux/libcfs/params_tree.h new file mode 100644 index 000000000000..6551f4b030d9 --- /dev/null +++ b/drivers/staging/lustre/include/linux/libcfs/params_tree.h @@ -0,0 +1,230 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * API and structure definitions for params_tree. + * + * Author: LiuYing + */ +#ifndef __PARAMS_TREE_H__ +#define __PARAMS_TREE_H__ + +#include + +#undef LPROCFS +#if defined(CONFIG_PROC_FS) +# define LPROCFS +#endif + +#ifdef LPROCFS +typedef struct file cfs_param_file_t; +typedef struct inode cfs_inode_t; +typedef struct proc_inode cfs_proc_inode_t; +typedef struct seq_file cfs_seq_file_t; +typedef struct seq_operations cfs_seq_ops_t; +typedef struct file_operations cfs_param_file_ops_t; +typedef module_t *cfs_param_module_t; +typedef struct proc_dir_entry cfs_param_dentry_t; +typedef struct poll_table_struct cfs_poll_table_t; +#define CFS_PARAM_MODULE THIS_MODULE +#define CFS_PDE(value) PDE(value) +#define cfs_file_private(file) (file->private_data) +#define cfs_dentry_data(dentry) (dentry->data) +#define cfs_proc_inode_pde(proc_inode) (proc_inode->pde) +#define cfs_proc_inode(proc_inode) (proc_inode->vfs_inode) +#define cfs_seq_read_common seq_read +#define cfs_seq_lseek_common seq_lseek +#define cfs_seq_private(seq) (seq->private) +#define cfs_seq_printf(seq, format, ...) seq_printf(seq, format, \ + ## __VA_ARGS__) +#define cfs_seq_release(inode, file) seq_release(inode, file) +#define cfs_seq_puts(seq, s) seq_puts(seq, s) +#define cfs_seq_putc(seq, s) seq_putc(seq, s) +#define cfs_seq_read(file, buf, count, ppos, rc) (rc = seq_read(file, buf, \ + count, ppos)) +#define cfs_seq_open(file, ops, rc) (rc = seq_open(file, ops)) + +/* in lprocfs_stat.c, to protect the private data for proc entries */ +extern struct rw_semaphore _lprocfs_lock; + +/* to begin from 2.6.23, Linux defines self file_operations (proc_reg_file_ops) + * in procfs, the proc file_operation defined by Lustre (lprocfs_generic_fops) + * will be wrapped into the new defined proc_reg_file_ops, which instroduces + * user count in proc_dir_entrey(pde_users) to protect the proc entry from + * being deleted. then the protection lock (_lprocfs_lock) defined by Lustre + * isn't necessary anymore for lprocfs_generic_fops(e.g. lprocfs_fops_read). + * see bug19706 for detailed information. + */ +#define LPROCFS_ENTRY() do{ }while(0) +#define LPROCFS_EXIT() do{ }while(0) + +static inline +int LPROCFS_ENTRY_AND_CHECK(struct proc_dir_entry *dp) +{ + int deleted = 0; + + spin_lock(&(dp)->pde_unload_lock); + if (dp->proc_fops == NULL) + deleted = 1; + spin_unlock(&(dp)->pde_unload_lock); + if (deleted) + return -ENODEV; + return 0; +} +#define LPROCFS_SRCH_ENTRY() \ +do { \ + down_read(&_lprocfs_lock); \ +} while(0) + +#define LPROCFS_SRCH_EXIT() \ +do { \ + up_read(&_lprocfs_lock); \ +} while(0) + +#define LPROCFS_WRITE_ENTRY() \ +do { \ + down_write(&_lprocfs_lock); \ +} while(0) + +#define LPROCFS_WRITE_EXIT() \ +do { \ + up_write(&_lprocfs_lock); \ +} while(0) +#else /* !LPROCFS */ + +typedef struct cfs_params_file { + void *param_private; + loff_t param_pos; + unsigned int param_flags; +} cfs_param_file_t; + +typedef struct cfs_param_inode { + void *param_private; +} cfs_inode_t; + +typedef struct cfs_param_dentry { + void *param_data; +} cfs_param_dentry_t; + +typedef struct cfs_proc_inode { + cfs_param_dentry_t *param_pde; + cfs_inode_t param_inode; +} cfs_proc_inode_t; + +struct cfs_seq_operations; +typedef struct cfs_seq_file { + char *buf; + size_t size; + size_t from; + size_t count; + loff_t index; + loff_t version; + struct mutex lock; + struct cfs_seq_operations *op; + void *private; +} cfs_seq_file_t; + +typedef struct cfs_seq_operations { + void *(*start) (cfs_seq_file_t *m, loff_t *pos); + void (*stop) (cfs_seq_file_t *m, void *v); + void *(*next) (cfs_seq_file_t *m, void *v, loff_t *pos); + int (*show) (cfs_seq_file_t *m, void *v); +} cfs_seq_ops_t; + +typedef void *cfs_param_module_t; +typedef void *cfs_poll_table_t; + +typedef struct cfs_param_file_ops { + cfs_param_module_t owner; + int (*open) (cfs_inode_t *, struct file *); + loff_t (*llseek)(struct file *, loff_t, int); + int (*release) (cfs_inode_t *, cfs_param_file_t *); + unsigned int (*poll) (struct file *, cfs_poll_table_t *); + ssize_t (*write) (struct file *, const char *, size_t, loff_t *); + ssize_t (*read)(struct file *, char *, size_t, loff_t *); +} cfs_param_file_ops_t; +typedef cfs_param_file_ops_t *cfs_lproc_filep_t; + +static inline cfs_proc_inode_t *FAKE_PROC_I(const cfs_inode_t *inode) +{ + return container_of(inode, cfs_proc_inode_t, param_inode); +} + +static inline cfs_param_dentry_t *FAKE_PDE(cfs_inode_t *inode) +{ + return FAKE_PROC_I(inode)->param_pde; +} + +#define CFS_PARAM_MODULE NULL +#define CFS_PDE(value) FAKE_PDE(value) +#define cfs_file_private(file) (file->param_private) +#define cfs_dentry_data(dentry) (dentry->param_data) +#define cfs_proc_inode(proc_inode) (proc_inode->param_inode) +#define cfs_proc_inode_pde(proc_inode) (proc_inode->param_pde) +#define cfs_seq_read_common NULL +#define cfs_seq_lseek_common NULL +#define cfs_seq_private(seq) (seq->private) +#define cfs_seq_read(file, buf, count, ppos, rc) do {} while(0) +#define cfs_seq_open(file, ops, rc) \ +do { \ + cfs_seq_file_t *p = cfs_file_private(file); \ + if (!p) { \ + LIBCFS_ALLOC(p, sizeof(*p)); \ + if (!p) { \ + rc = -ENOMEM; \ + break; \ + } \ + cfs_file_private(file) = p; \ + } \ + memset(p, 0, sizeof(*p)); \ + p->op = ops; \ + rc = 0; \ +} while(0) + +#define LPROCFS_ENTRY() do {} while(0) +#define LPROCFS_EXIT() do {} while(0) +static inline +int LPROCFS_ENTRY_AND_CHECK(cfs_param_dentry_t *dp) +{ + LPROCFS_ENTRY(); + return 0; +} +#define LPROCFS_WRITE_ENTRY() do {} while(0) +#define LPROCFS_WRITE_EXIT() do {} while(0) + +#endif /* LPROCFS */ + +/* XXX: params_tree APIs */ + +#endif /* __PARAMS_TREE_H__ */ diff --git a/drivers/staging/lustre/include/linux/lnet/api-support.h b/drivers/staging/lustre/include/linux/lnet/api-support.h new file mode 100644 index 000000000000..a8d91dbe6060 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/api-support.h @@ -0,0 +1,44 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_SUPPORT_H__ +#define __LNET_API_SUPPORT_H__ + +#include + +#include +#include +#include + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h new file mode 100644 index 000000000000..e8642e33860d --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/api.h @@ -0,0 +1,220 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_API_H__ +#define __LNET_API_H__ + +/** \defgroup lnet LNet + * + * The Lustre Networking subsystem. + * + * LNet is an asynchronous message-passing API, which provides an unreliable + * connectionless service that can't guarantee any order. It supports OFA IB, + * TCP/IP, and Cray Portals, and routes between heterogeneous networks. + * + * LNet can run both in OS kernel space and in userspace as a library. + * @{ + */ + +#include + +/** \defgroup lnet_init_fini Initialization and cleanup + * The LNet must be properly initialized before any LNet calls can be made. + * @{ */ +int LNetInit(void); +void LNetFini(void); + +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); +/** @} lnet_init_fini */ + +/** \defgroup lnet_addr LNet addressing and basic types + * + * Addressing scheme and basic data types of LNet. + * + * The LNet API is memory-oriented, so LNet must be able to address not only + * end-points but also memory region within a process address space. + * An ::lnet_nid_t addresses an end-point. An ::lnet_pid_t identifies a process + * in a node. A portal represents an opening in the address space of a + * process. Match bits is criteria to identify a region of memory inside a + * portal, and offset specifies an offset within the memory region. + * + * LNet creates a table of portals for each process during initialization. + * This table has MAX_PORTALS entries and its size can't be dynamically + * changed. A portal stays empty until the owning process starts to add + * memory regions to it. A portal is sometimes called an index because + * it's an entry in the portals table of a process. + * + * \see LNetMEAttach + * @{ */ +int LNetGetId(unsigned int index, lnet_process_id_t *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); +void LNetSnprintHandle(char *str, int str_len, lnet_handle_any_t handle); + +/** @} lnet_addr */ + + +/** \defgroup lnet_me Match entries + * + * A match entry (abbreviated as ME) describes a set of criteria to accept + * incoming requests. + * + * A portal is essentially a match list plus a set of attributes. A match + * list is a chain of MEs. Each ME includes a pointer to a memory descriptor + * and a set of match criteria. The match criteria can be used to reject + * incoming requests based on process ID or the match bits provided in the + * request. MEs can be dynamically inserted into a match list by LNetMEAttach() + * and LNetMEInsert(), and removed from its list by LNetMEUnlink(). + * @{ */ +int LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t pos_in, + lnet_handle_me_t *handle_out); + +int LNetMEInsert(lnet_handle_me_t current_in, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t position_in, + lnet_handle_me_t *handle_out); + +int LNetMEUnlink(lnet_handle_me_t current_in); +/** @} lnet_me */ + +/** \defgroup lnet_md Memory descriptors + * + * A memory descriptor contains information about a region of a user's + * memory (either in kernel or user space) and optionally points to an + * event queue where information about the operations performed on the + * memory descriptor are recorded. Memory descriptor is abbreviated as + * MD and can be used interchangeably with the memory region it describes. + * + * The LNet API provides two operations to create MDs: LNetMDAttach() + * and LNetMDBind(); one operation to unlink and release the resources + * associated with a MD: LNetMDUnlink(). + * @{ */ +int LNetMDAttach(lnet_handle_me_t current_in, + lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDBind(lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); + +int LNetMDUnlink(lnet_handle_md_t md_in); +/** @} lnet_md */ + +/** \defgroup lnet_eq Events and event queues + * + * Event queues (abbreviated as EQ) are used to log operations performed on + * local MDs. In particular, they signal the completion of a data transmission + * into or out of a MD. They can also be used to hold acknowledgments for + * completed PUT operations and indicate when a MD has been unlinked. Multiple + * MDs can share a single EQ. An EQ may have an optional event handler + * associated with it. If an event handler exists, it will be run for each + * event that is deposited into the EQ. + * + * In addition to the lnet_handle_eq_t, the LNet API defines two types + * associated with events: The ::lnet_event_kind_t defines the kinds of events + * that can be stored in an EQ. The lnet_event_t defines a structure that + * holds the information about with an event. + * + * There are five functions for dealing with EQs: LNetEQAlloc() is used to + * create an EQ and allocate the resources needed, while LNetEQFree() + * releases these resources and free the EQ. LNetEQGet() retrieves the next + * event from an EQ, and LNetEQWait() can be used to block a process until + * an EQ has at least one event. LNetEQPoll() can be used to test or wait + * on multiple EQs. + * @{ */ +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + lnet_handle_eq_t *handle_out); + +int LNetEQFree(lnet_handle_eq_t eventq_in); + +int LNetEQGet(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + + +int LNetEQWait(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); + +int LNetEQPoll(lnet_handle_eq_t *eventqs_in, + int neq_in, + int timeout_ms, + lnet_event_t *event_out, + int *which_eq_out); +/** @} lnet_eq */ + +/** \defgroup lnet_data Data movement operations + * + * The LNet API provides two data movement operations: LNetPut() + * and LNetGet(). + * @{ */ +int LNetPut(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_ack_req_t ack_req_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in); +/** @} lnet_data */ + + +/** \defgroup lnet_misc Miscellaneous operations. + * Miscellaneous operations. + * @{ */ + +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); +int LNetCtl(unsigned int cmd, void *arg); +int LNetSetAsync(lnet_process_id_t id, int nasync); + +/** @} lnet_misc */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h new file mode 100644 index 000000000000..59bff0bea816 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h @@ -0,0 +1,874 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-lnet.h + * + * Top level include for library side routines + */ + +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ + +#include + +#include +#include +#include +#include + +extern lnet_t the_lnet; /* THE network */ + +#if defined(LNET_USE_LIB_FREELIST) +/* 1 CPT, simplify implementation... */ +# define LNET_CPT_MAX_BITS 0 + +#else /* KERNEL and no freelist */ + +# if (BITS_PER_LONG == 32) +/* 2 CPTs, allowing more CPTs might make us under memory pressure */ +# define LNET_CPT_MAX_BITS 1 + +# else /* 64-bit system */ +/* + * 256 CPTs for thousands of CPUs, allowing more CPTs might make us + * under risk of consuming all lh_cookie. + */ +# define LNET_CPT_MAX_BITS 8 +# endif /* BITS_PER_LONG == 32 */ +#endif + +/* max allowed CPT number */ +#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) + +#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) +#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) +#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) + +/** exclusive lock */ +#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX + +static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && + wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); +} + +static inline int lnet_md_exhausted (lnet_libmd_t *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} + +static inline int lnet_md_unlinkable (lnet_libmd_t *md) +{ + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. + */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); +} + +#define lnet_cpt_table() (the_lnet.ln_cpt_table) +#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) + +static inline int +lnet_cpt_of_cookie(__u64 cookie) +{ + unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; + + /* LNET_CPT_NUMBER doesn't have to be power2, which means we can + * get illegal cpt from it's invalid cookie */ + return cpt < LNET_CPT_NUMBER ? cpt : cpt % LNET_CPT_NUMBER; +} + +static inline void +lnet_res_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_res_lock, cpt); +} + +static inline void +lnet_res_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); +} + +static inline int +lnet_res_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_res_lock(cpt); + return cpt; +} + +static inline void +lnet_net_lock(int cpt) +{ + cfs_percpt_lock(the_lnet.ln_net_lock, cpt); +} + +static inline void +lnet_net_unlock(int cpt) +{ + cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); +} + +static inline int +lnet_net_lock_current(void) +{ + int cpt = lnet_cpt_current(); + + lnet_net_lock(cpt); + return cpt; +} + +#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) +#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) + + +#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) +#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) +#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) +#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) +#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) +#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) +#define LNET_MUTEX_LOCK(m) mutex_lock(m) +#define LNET_MUTEX_UNLOCK(m) mutex_unlock(m) + + +#define MAX_PORTALS 64 + +/* these are only used by code with LNET_USE_LIB_FREELIST, but we still + * exported them to !LNET_USE_LIB_FREELIST for easy implemetation */ +#define LNET_FL_MAX_MES 2048 +#define LNET_FL_MAX_MDS 2048 +#define LNET_FL_MAX_EQS 512 +#define LNET_FL_MAX_MSGS 2048 /* Outstanding messages */ + +#ifdef LNET_USE_LIB_FREELIST + +int lnet_freelist_init(lnet_freelist_t *fl, int n, int size); +void lnet_freelist_fini(lnet_freelist_t *fl); + +static inline void * +lnet_freelist_alloc (lnet_freelist_t *fl) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lnet_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lnet_freelist_free (lnet_freelist_t *fl, void *obj) +{ + /* ALWAYS called with liblock held */ + lnet_freeobj_t *o = list_entry (obj, lnet_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lnet_eq_t * +lnet_eq_alloc (void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + lnet_eq_t *eq; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + eq = (lnet_eq_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return eq; +} + +static inline void +lnet_eq_free_locked(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = &the_lnet.ln_eq_container; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, eq); +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + lnet_res_lock(0); + lnet_eq_free_locked(eq); + lnet_res_unlock(0); +} + +static inline lnet_libmd_t * +lnet_md_alloc (lnet_md_t *umd) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + lnet_libmd_t *md; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + md = (lnet_libmd_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + if (md != NULL) + INIT_LIST_HEAD(&md->md_list); + + return md; +} + +static inline void +lnet_md_free_locked(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_md_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, md); +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + lnet_res_lock(0); + lnet_md_free_locked(md); + lnet_res_unlock(0); +} + +static inline lnet_me_t * +lnet_me_alloc(void) +{ + /* NEVER called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + lnet_me_t *me; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_res_lock(0); + me = (lnet_me_t *)lnet_freelist_alloc(&rec->rec_freelist); + lnet_res_unlock(0); + + return me; +} + +static inline void +lnet_me_free_locked(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + struct lnet_res_container *rec = the_lnet.ln_me_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + lnet_freelist_free(&rec->rec_freelist, me); +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + lnet_res_lock(0); + lnet_me_free_locked(me); + lnet_res_unlock(0); +} + +static inline lnet_msg_t * +lnet_msg_alloc (void) +{ + /* NEVER called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + lnet_msg_t *msg; + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + msg = (lnet_msg_t *)lnet_freelist_alloc(&msc->msc_freelist); + lnet_net_unlock(0); + + if (msg != NULL) { + /* NULL pointers, clear flags etc */ + memset(msg, 0, sizeof(*msg)); + } + return msg; +} + +static inline void +lnet_msg_free_locked(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + struct lnet_msg_container *msc = the_lnet.ln_msg_containers[0]; + + LASSERT(LNET_CPT_NUMBER == 1); + LASSERT(!msg->msg_onactivelist); + lnet_freelist_free(&msc->msc_freelist, msg); +} + +static inline void +lnet_msg_free (lnet_msg_t *msg) +{ + lnet_net_lock(0); + lnet_msg_free_locked(msg); + lnet_net_unlock(0); +} + +#else /* !LNET_USE_LIB_FREELIST */ + +static inline lnet_eq_t * +lnet_eq_alloc (void) +{ + /* NEVER called with liblock held */ + lnet_eq_t *eq; + + LIBCFS_ALLOC(eq, sizeof(*eq)); + return (eq); +} + +static inline void +lnet_eq_free(lnet_eq_t *eq) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(eq, sizeof(*eq)); +} + +static inline lnet_libmd_t * +lnet_md_alloc (lnet_md_t *umd) +{ + /* NEVER called with liblock held */ + lnet_libmd_t *md; + unsigned int size; + unsigned int niov; + + if ((umd->options & LNET_MD_KIOV) != 0) { + niov = umd->length; + size = offsetof(lnet_libmd_t, md_iov.kiov[niov]); + } else { + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? + umd->length : 1; + size = offsetof(lnet_libmd_t, md_iov.iov[niov]); + } + + LIBCFS_ALLOC(md, size); + + if (md != NULL) { + /* Set here in case of early free */ + md->md_options = umd->options; + md->md_niov = niov; + INIT_LIST_HEAD(&md->md_list); + } + + return (md); +} + +static inline void +lnet_md_free(lnet_libmd_t *md) +{ + /* ALWAYS called with resource lock held */ + unsigned int size; + + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]); + else + size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]); + + LIBCFS_FREE(md, size); +} + +static inline lnet_me_t * +lnet_me_alloc (void) +{ + /* NEVER called with liblock held */ + lnet_me_t *me; + + LIBCFS_ALLOC(me, sizeof(*me)); + return (me); +} + +static inline void +lnet_me_free(lnet_me_t *me) +{ + /* ALWAYS called with resource lock held */ + LIBCFS_FREE(me, sizeof(*me)); +} + +static inline lnet_msg_t * +lnet_msg_alloc(void) +{ + /* NEVER called with liblock held */ + lnet_msg_t *msg; + + LIBCFS_ALLOC(msg, sizeof(*msg)); + + /* no need to zero, LIBCFS_ALLOC does for us */ + return (msg); +} + +static inline void +lnet_msg_free(lnet_msg_t *msg) +{ + /* ALWAYS called with network lock held */ + LASSERT(!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); +} + +#define lnet_eq_free_locked(eq) lnet_eq_free(eq) +#define lnet_md_free_locked(md) lnet_md_free(md) +#define lnet_me_free_locked(me) lnet_me_free(me) +#define lnet_msg_free_locked(msg) lnet_msg_free(msg) + +#endif /* LNET_USE_LIB_FREELIST */ + +lnet_libhandle_t *lnet_res_lh_lookup(struct lnet_res_container *rec, + __u64 cookie); +void lnet_res_lh_initialize(struct lnet_res_container *rec, + lnet_libhandle_t *lh); +static inline void +lnet_res_lh_invalidate(lnet_libhandle_t *lh) +{ + /* ALWAYS called with resource lock held */ + /* NB: cookie is still useful, don't reset it */ + list_del(&lh->lh_hash_chain); +} + +static inline void +lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq) +{ + if (eq == NULL) { + LNetInvalidateHandle(handle); + return; + } + + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lnet_eq_t * +lnet_handle2eq(lnet_handle_eq_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + + lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_eq_t, eq_lh); +} + +static inline void +lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lnet_libmd_t * +lnet_handle2md(lnet_handle_md_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline lnet_libmd_t * +lnet_wire_handle2md(lnet_handle_wire_t *wh) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) + return NULL; + + cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], + wh->wh_object_cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_libmd_t, md_lh); +} + +static inline void +lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lnet_me_t * +lnet_handle2me(lnet_handle_me_t *handle) +{ + /* ALWAYS called with resource lock held */ + lnet_libhandle_t *lh; + int cpt; + + cpt = lnet_cpt_of_cookie(handle->cookie); + lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], + handle->cookie); + if (lh == NULL) + return NULL; + + return lh_entry(lh, lnet_me_t, me_lh); +} + +static inline void +lnet_peer_addref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount++; +} + +extern void lnet_destroy_peer_locked(lnet_peer_t *lp); + +static inline void +lnet_peer_decref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount--; + if (lp->lp_refcount == 0) + lnet_destroy_peer_locked(lp); +} + +static inline int +lnet_isrouter(lnet_peer_t *lp) +{ + return lp->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] >= 0); + + (*ni->ni_refs[cpt])++; +} + +static inline void +lnet_ni_addref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_addref_locked(ni, 0); + lnet_net_unlock(0); +} + +static inline void +lnet_ni_decref_locked(lnet_ni_t *ni, int cpt) +{ + LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); + LASSERT(*ni->ni_refs[cpt] > 0); + + (*ni->ni_refs[cpt])--; +} + +static inline void +lnet_ni_decref(lnet_ni_t *ni) +{ + lnet_net_lock(0); + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); +} + +void lnet_ni_free(lnet_ni_t *ni); + +static inline int +lnet_nid2peerhash(lnet_nid_t nid) +{ + return cfs_hash_long(nid, LNET_PEER_HASH_BITS); +} + +static inline struct list_head * +lnet_net2rnethash(__u32 net) +{ + return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + + LNET_NETTYP(net)) & + ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; +} + +extern lnd_t the_lolnd; + + +extern int lnet_cpt_of_nid_locked(lnet_nid_t nid); +extern int lnet_cpt_of_nid(lnet_nid_t nid); +extern lnet_ni_t *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); +extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt); +extern lnet_ni_t *lnet_net2ni(__u32 net); + +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, cfs_time_t when); +void lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when); +int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive); +void lnet_proc_init(void); +void lnet_proc_fini(void); +int lnet_rtrpools_alloc(int im_a_router); +void lnet_rtrpools_free(void); +lnet_remotenet_t *lnet_find_net_locked (__u32 net); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen); +void lnet_msg_detach_md(lnet_msg_t *msg, int status); +void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev); +void lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type); +void lnet_msg_commit(lnet_msg_t *msg, int cpt); +void lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status); + +void lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev); +void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len); +int lnet_send(lnet_nid_t nid, lnet_msg_t *msg, lnet_nid_t rtr_nid); +void lnet_return_tx_credits_locked(lnet_msg_t *msg); +void lnet_return_rx_credits_locked(lnet_msg_t *msg); + +/* portals functions */ +/* portals attributes */ +static inline int +lnet_ptl_is_lazy(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_LAZY); +} + +static inline int +lnet_ptl_is_unique(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); +} + +static inline int +lnet_ptl_is_wildcard(lnet_portal_t *ptl) +{ + return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); +} + +static inline void +lnet_ptl_setopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options |= opt; +} + +static inline void +lnet_ptl_unsetopt(lnet_portal_t *ptl, int opt) +{ + ptl->ptl_options &= ~opt; +} + +/* match-table functions */ +struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits); +struct lnet_match_table *lnet_mt_of_attach(unsigned int index, + lnet_process_id_t id, __u64 mbits, + __u64 ignore_bits, + lnet_ins_pos_t pos); +int lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg); + +/* portals match/attach functions */ +void lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops); +void lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md); +int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); + +/* initialized and finalize portals */ +int lnet_portals_create(void); +void lnet_portals_destroy(void); + +/* message functions */ +int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen); +lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg); +void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len); +void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc); +void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); +void lnet_recv_delayed_msg_list(struct list_head *head); + +int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); +void lnet_msg_container_cleanup(struct lnet_msg_container *container); +void lnet_msg_containers_destroy(void); +int lnet_msg_containers_create(void); + +char *lnet_msgtyp2str (int type); +void lnet_print_hdr (lnet_hdr_t * hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +void lnet_counters_get(lnet_counters_t *counters); +void lnet_counters_reset(void); + +unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov); +int lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, + unsigned int doffset, + unsigned int nsiov, struct iovec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct iovec *siov, unsigned int soffset, + unsigned int nob) +{ + struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen}; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset, + unsigned int nob) +{ + struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen}; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen}; + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen}; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(lnet_me_t *me); + +void lnet_md_unlink(lnet_libmd_t *md); +void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); + +void lnet_register_lnd(lnd_t *lnd); +void lnet_unregister_lnd(lnd_t *lnd); +int lnet_set_ip_niaddr (lnet_ni_t *ni); + +int lnet_connect(socket_t **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nis(void); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); + +int lnet_count_acceptor_nis(void); +int lnet_acceptor_port(void); + +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +void lnet_get_tunables(void); +int lnet_peers_start_down(void); +int lnet_peer_buffer_credits(lnet_ni_t *ni); + +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); +void lnet_swap_pinginfo(lnet_ping_info_t *info); + +int lnet_ping_target_init(void); +void lnet_ping_target_fini(void); +int lnet_ping(lnet_process_id_t id, int timeout_ms, + lnet_process_id_t *ids, int n_ids); + +int lnet_parse_ip2nets (char **networksp, char *ip2nets); +int lnet_parse_routes (char *route_str, int *im_a_router); +int lnet_parse_networks (struct list_head *nilist, char *networks); + +int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt); +lnet_peer_t *lnet_find_peer_locked(struct lnet_peer_table *ptable, + lnet_nid_t nid); +void lnet_peer_tables_cleanup(void); +void lnet_peer_tables_destroy(void); +int lnet_peer_tables_create(void); +void lnet_debug_peer(lnet_nid_t nid); + + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h new file mode 100644 index 000000000000..86428d4b993e --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lib-types.h @@ -0,0 +1,765 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ + +#include + +#include +#include +#include + +#define WIRE_ATTR __attribute__((packed)) + +/* Packed version of lnet_process_id_t to transfer via network */ +typedef struct { + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} WIRE_ATTR lnet_process_id_packed_t; + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR lnet_handle_wire_t; + +typedef enum { + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} lnet_msg_type_t; + +/* The variant fields of the portals message header are aligned on an 8 + * byte boundary in the message header. Note that all types used in these + * wire structs MUST be fixed size and the smaller types are placed at the + * end. */ +typedef struct lnet_ack { + lnet_handle_wire_t dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR lnet_ack_t; + +typedef struct lnet_put { + lnet_handle_wire_t ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR lnet_put_t; + +typedef struct lnet_get { + lnet_handle_wire_t return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR lnet_get_t; + +typedef struct lnet_reply { + lnet_handle_wire_t dst_wmd; +} WIRE_ATTR lnet_reply_t; + +typedef struct lnet_hello { + __u64 incarnation; + __u32 type; +} WIRE_ATTR lnet_hello_t; + +typedef struct { + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* lnet_msg_type_t */ + __u32 payload_length; /* payload data to follow */ + /*<------__u64 aligned------->*/ + union { + lnet_ack_t ack; + lnet_put_t put; + lnet_get_t get; + lnet_reply_t reply; + lnet_hello_t hello; + } msg; +} WIRE_ATTR lnet_hdr_t; + +/* A HELLO message contains a magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * LNET_MSG_HELLO in the type field. All other common fields are zero + * (including payload_size; i.e. no payload). + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr_t::msg. + */ +typedef struct { + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR lnet_magicversion_t; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_RA_MAGIC 0x0be91b92 +#define LNET_PROTO_QSW_MAGIC 0x0be91b93 +#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define LNET_PROTO_MX_MAGIC 0x4d583130 /* 'MX10'! */ +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond with a + * "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! */ + + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ +typedef struct { + __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR lnet_acceptor_connreq_t; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +/* forward refs */ +struct lnet_libmd; + +typedef struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + lnet_process_id_t msg_target; + /* where is it from, it's only for building event */ + lnet_nid_t msg_from; + __u32 msg_type; + + /* commited for sending */ + unsigned int msg_tx_committed:1; + /* CPT # this message committed for sending */ + unsigned int msg_tx_cpt:15; + /* commited for receiving */ + unsigned int msg_rx_committed:1; + /* CPT # this message committed for receiving */ + unsigned int msg_rx_cpt:15; + /* queued for tx credit */ + unsigned int msg_tx_delayed:1; + /* queued for RX buffer */ + unsigned int msg_rx_delayed:1; + /* ready for pending on RX delay list */ + unsigned int msg_rx_ready_delay:1; + + unsigned int msg_vmflush:1; /* VM trying to free memory */ + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a globel router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + + struct lnet_peer *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct iovec *msg_iov; + lnet_kiov_t *msg_kiov; + + lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; +} lnet_msg_t; + + +typedef struct lnet_libhandle { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lnet_libhandle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(char *)(&((type *)0)->member))) + +typedef struct lnet_eq { + struct list_head eq_list; + lnet_libhandle_t eq_lh; + lnet_seq_t eq_enq_seq; + lnet_seq_t eq_deq_seq; + unsigned int eq_size; + lnet_eq_handler_t eq_callback; + lnet_event_t *eq_events; + int **eq_refs; /* percpt refcount for EQ */ +} lnet_eq_t; + +typedef struct lnet_me { + struct list_head me_list; + lnet_libhandle_t me_lh; + lnet_process_id_t me_match_id; + unsigned int me_portal; + unsigned int me_pos; /* hash offset in mt_hash */ + __u64 me_match_bits; + __u64 me_ignore_bits; + lnet_unlink_t me_unlink; + struct lnet_libmd *me_md; +} lnet_me_t; + +typedef struct lnet_libmd { + struct list_head md_list; + lnet_libhandle_t md_lh; + lnet_me_t *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; + unsigned int md_flags; + void *md_user_ptr; + lnet_eq_t *md_eq; + unsigned int md_niov; /* # frags */ + union { + struct iovec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; + } md_iov; +} lnet_libmd_t; + +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) + +#ifdef LNET_USE_LIB_FREELIST +typedef struct +{ + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lnet_freelist_t; + +typedef struct +{ + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lnet_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lnet_test_peer_t; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPE_BITS 2 +#define LNET_COOKIE_MASK ((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL) + +struct lnet_ni; /* forward ref */ + +typedef struct lnet_lnd +{ + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialised by the LND */ + unsigned int lnd_type; + + int (*lnd_startup) (struct lnet_ni *ni); + void (*lnd_shutdown) (struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct iovec *iov != NULL) + * OR + * in pages (kernel only: plt_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' and may specify a byte offset within the set of + * fragments to start from + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immedaite failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called. */ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + + /* query of peer aliveness */ + void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, cfs_time_t *when); + + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, socket_t *sock); + +} lnd_t; + +#define LNET_NI_STATUS_UP 0x15aac0de +#define LNET_NI_STATUS_DOWN 0xdeadface +#define LNET_NI_STATUS_INVALID 0x00000000 +typedef struct { + lnet_nid_t ns_nid; + __u32 ns_status; + __u32 ns_unused; +} WIRE_ATTR lnet_ni_status_t; + +struct lnet_tx_queue { + int tq_credits; /* # tx credits free */ + int tq_credits_min; /* lowest it's been */ + int tq_credits_max; /* total # tx credits */ + struct list_head tq_delayed; /* delayed TXs */ +}; + +#define LNET_MAX_INTERFACES 16 + +typedef struct lnet_ni { + spinlock_t ni_lock; + struct list_head ni_list; /* chain on ln_nis */ + struct list_head ni_cptlist; /* chain on ln_nis_cpt */ + int ni_maxtxcredits; /* # tx credits */ + /* # per-peer send credits */ + int ni_peertxcredits; + /* # per-peer router buffer credits */ + int ni_peerrtrcredits; + /* seconds to consider peer dead */ + int ni_peertimeout; + int ni_ncpts; /* number of CPTs */ + __u32 *ni_cpts; /* bond NI on some CPTs */ + lnet_nid_t ni_nid; /* interface's NID */ + void *ni_data; /* instance-specific data */ + lnd_t *ni_lnd; /* procedural interface */ + struct lnet_tx_queue **ni_tx_queues; /* percpt TX queues */ + int **ni_refs; /* percpt reference count */ + long ni_last_alive; /* when I was last alive */ + lnet_ni_status_t *ni_status; /* my health status */ + /* equivalent interfaces to use */ + char *ni_interfaces[LNET_MAX_INTERFACES]; +} lnet_ni_t; + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x + * of old LNet, so there shouldn't be any compatibility issue */ +#define LNET_PING_FEAT_INVAL (0) /* no feature */ +#define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ +#define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ + +#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS) + +typedef struct { + __u32 pi_magic; + __u32 pi_features; + lnet_pid_t pi_pid; + __u32 pi_nnis; + lnet_ni_status_t pi_ni[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* router checker data, per router */ +#define LNET_MAX_RTR_NIS 16 +#define LNET_PINGINFO_SIZE offsetof(lnet_ping_info_t, pi_ni[LNET_MAX_RTR_NIS]) +typedef struct { + /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ + struct list_head rcd_list; + lnet_handle_md_t rcd_mdh; /* ping buffer MD */ + struct lnet_peer *rcd_gateway; /* reference to gateway */ + lnet_ping_info_t *rcd_pinginfo; /* ping buffer */ +} lnet_rc_data_t; + +typedef struct lnet_peer { + struct list_head lp_hashlist; /* chain on peer hash */ + struct list_head lp_txq; /* messages blocking for tx credits */ + struct list_head lp_rtrq; /* messages blocking for router credits */ + struct list_head lp_rtr_list; /* chain on router list */ + int lp_txcredits; /* # tx credits available */ + int lp_mintxcredits; /* low water mark */ + int lp_rtrcredits; /* # router credits */ + int lp_minrtrcredits; /* low water mark */ + unsigned int lp_alive:1; /* alive/dead? */ + unsigned int lp_notify:1; /* notification outstanding? */ + unsigned int lp_notifylnd:1; /* outstanding notification for LND? */ + unsigned int lp_notifying:1; /* some thread is handling notification */ + unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ + int lp_alive_count; /* # times router went dead<->alive */ + long lp_txqnob; /* bytes queued for sending */ + cfs_time_t lp_timestamp; /* time of last aliveness news */ + cfs_time_t lp_ping_timestamp; /* time of last ping attempt */ + cfs_time_t lp_ping_deadline; /* != 0 if ping reply expected */ + cfs_time_t lp_last_alive; /* when I was last alive */ + cfs_time_t lp_last_query; /* when lp_ni was queried last time */ + lnet_ni_t *lp_ni; /* interface peer is on */ + lnet_nid_t lp_nid; /* peer's NID */ + int lp_refcount; /* # refs */ + int lp_cpt; /* CPT this peer attached on */ + /* # refs from lnet_route_t::lr_gateway */ + int lp_rtr_refcount; + /* returned RC ping features */ + unsigned int lp_ping_feats; + struct list_head lp_routes; /* routers on this peer */ + lnet_rc_data_t *lp_rcd; /* router checker state */ +} lnet_peer_t; + + +/* peer hash size */ +#define LNET_PEER_HASH_BITS 9 +#define LNET_PEER_HASH_SIZE (1 << LNET_PEER_HASH_BITS) + +/* peer hash table */ +struct lnet_peer_table { + int pt_version; /* /proc validity stamp */ + int pt_number; /* # peers extant */ + struct list_head pt_deathrow; /* zombie peers */ + struct list_head *pt_hash; /* NID->peer hash */ +}; + +/* peer aliveness is enabled only on routers for peers in a network where the + * lnet_ni_t::ni_peertimeout has been set to a positive value */ +#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ + (lp)->lp_ni->ni_peertimeout > 0) + +typedef struct { + struct list_head lr_list; /* chain on net */ + struct list_head lr_gwlist; /* chain on gateway */ + lnet_peer_t *lr_gateway; /* router node */ + __u32 lr_net; /* remote network number */ + int lr_seq; /* sequence for round-robin */ + unsigned int lr_downis; /* number of down NIs */ + unsigned int lr_hops; /* how far I am */ +} lnet_route_t; + +#define LNET_REMOTE_NETS_HASH_DEFAULT (1U << 7) +#define LNET_REMOTE_NETS_HASH_MAX (1U << 16) +#define LNET_REMOTE_NETS_HASH_SIZE (1 << the_lnet.ln_remote_nets_hbits) + +typedef struct { + struct list_head lrn_list; /* chain on ln_remote_nets_hash */ + struct list_head lrn_routes; /* routes to me */ + __u32 lrn_net; /* my net number */ +} lnet_remotenet_t; + +typedef struct { + struct list_head rbp_bufs; /* my free buffer pool */ + struct list_head rbp_msgs; /* messages blocking for a buffer */ + int rbp_npages; /* # pages in each buffer */ + int rbp_nbuffers; /* # buffers */ + int rbp_credits; /* # free buffers / blocked messages */ + int rbp_mincredits; /* low water mark */ +} lnet_rtrbufpool_t; + +typedef struct { + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +typedef struct { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} WIRE_ATTR lnet_counters_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! */ + +#define LNET_NRBPOOLS 3 /* # different router buffer pools */ + +enum { + /* Didn't match anything */ + LNET_MATCHMD_NONE = (1 << 0), + /* Matched OK */ + LNET_MATCHMD_OK = (1 << 1), + /* Must be discarded */ + LNET_MATCHMD_DROP = (1 << 2), + /* match and buffer is exhausted */ + LNET_MATCHMD_EXHAUSTED = (1 << 3), + /* match or drop */ + LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), +}; + +/* Options for lnet_portal_t::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +#define LNET_PTL_MATCH_UNIQUE (1 << 1) /* unique match, for RDMA */ +#define LNET_PTL_MATCH_WILDCARD (1 << 2) /* wildcard match, request portal */ + +/* parameter for matching operations (GET, PUT) */ +struct lnet_match_info { + __u64 mi_mbits; + lnet_process_id_t mi_id; + unsigned int mi_opc; + unsigned int mi_portal; + unsigned int mi_rlength; + unsigned int mi_roffset; +}; + +/* ME hash of RDMA portal */ +#define LNET_MT_HASH_BITS 8 +#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) +#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) +/* we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, + * the last entry is reserved for MEs with ignore-bits */ +#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE +/* __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which + * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the + * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] */ +#define LNET_MT_BITS_U64 6 /* 2^6 bits */ +#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) +#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) + +/* portal match table */ +struct lnet_match_table { + /* reserved for upcoming patches, CPU partition ID */ + unsigned int mt_cpt; + unsigned int mt_portal; /* portal index */ + /* match table is set as "enabled" if there's non-exhausted MD + * attached on mt_mhash, it's only valide for wildcard portal */ + unsigned int mt_enabled; + /* bitmap to flag whether MEs on mt_hash are exhausted or not */ + __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; + struct list_head *mt_mhash; /* matching hash */ +}; + +/* these are only useful for wildcard portal */ +/* Turn off message rotor for wildcard portals */ +#define LNET_PTL_ROTOR_OFF 0 +/* round-robin dispatch all PUT messages for wildcard portals */ +#define LNET_PTL_ROTOR_ON 1 +/* round-robin dispatch routed PUT message for wildcard portals */ +#define LNET_PTL_ROTOR_RR_RT 2 +/* dispatch routed PUT message by hashing source NID for wildcard portals */ +#define LNET_PTL_ROTOR_HASH_RT 3 + +typedef struct lnet_portal { + spinlock_t ptl_lock; + unsigned int ptl_index; /* portal ID, reserved */ + /* flags on this portal: lazy, unique... */ + unsigned int ptl_options; + /* list of messags which are stealing buffer */ + struct list_head ptl_msg_stealing; + /* messages blocking for MD */ + struct list_head ptl_msg_delayed; + /* Match table for each CPT */ + struct lnet_match_table **ptl_mtables; + /* spread rotor of incoming "PUT" */ + int ptl_rotor; + /* # active entries for this portal */ + int ptl_mt_nmaps; + /* array of active entries' cpu-partition-id */ + int ptl_mt_maps[0]; +} lnet_portal_t; + +#define LNET_LH_HASH_BITS 12 +#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) +#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) + +/* resource container (ME, MD, EQ) */ +struct lnet_res_container { + unsigned int rec_type; /* container type */ + __u64 rec_lh_cookie; /* cookie generator */ + struct list_head rec_active; /* active resource list */ + struct list_head *rec_lh_hash; /* handle hash */ +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t rec_freelist; /* freelist for resources */ +#endif +}; + +/* message container */ +struct lnet_msg_container { + int msc_init; /* initialized or not */ + /* max # threads finalizing */ + int msc_nfinalizers; + /* msgs waiting to complete finalizing */ + struct list_head msc_finalizing; + struct list_head msc_active; /* active message list */ + /* threads doing finalization */ + void **msc_finalizers; +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t msc_freelist; /* freelist for messages */ +#endif +}; + +/* Router Checker states */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ + +typedef struct +{ + /* CPU partition table of LNet */ + struct cfs_cpt_table *ln_cpt_table; + /* number of CPTs in ln_cpt_table */ + unsigned int ln_cpt_number; + unsigned int ln_cpt_bits; + + /* protect LNet resources (ME/MD/EQ) */ + struct cfs_percpt_lock *ln_res_lock; + /* # portals */ + int ln_nportals; + /* the vector of portals */ + lnet_portal_t **ln_portals; + /* percpt ME containers */ + struct lnet_res_container **ln_me_containers; + /* percpt MD container */ + struct lnet_res_container **ln_md_containers; + + /* Event Queue container */ + struct lnet_res_container ln_eq_container; + wait_queue_head_t ln_eq_waitq; + spinlock_t ln_eq_wait_lock; + unsigned int ln_remote_nets_hbits; + + /* protect NI, peer table, credits, routers, rtrbuf... */ + struct cfs_percpt_lock *ln_net_lock; + /* percpt message containers for active/finalizing/freed message */ + struct lnet_msg_container **ln_msg_containers; + lnet_counters_t **ln_counters; + struct lnet_peer_table **ln_peer_tables; + /* failure simulation */ + struct list_head ln_test_peers; + + struct list_head ln_nis; /* LND instances */ + /* NIs bond on specific CPT(s) */ + struct list_head ln_nis_cpt; + /* dying LND instances */ + struct list_head ln_nis_zombie; + lnet_ni_t *ln_loni; /* the loopback NI */ + /* NI to wait for events in */ + lnet_ni_t *ln_eq_waitni; + + /* remote networks with routes to them */ + struct list_head *ln_remote_nets_hash; + /* validity stamp */ + __u64 ln_remote_nets_version; + /* list of all known routers */ + struct list_head ln_routers; + /* validity stamp */ + __u64 ln_routers_version; + /* percpt router buffer pools */ + lnet_rtrbufpool_t **ln_rtrpools; + + lnet_handle_md_t ln_ping_target_md; + lnet_handle_eq_t ln_ping_target_eq; + lnet_ping_info_t *ln_ping_info; + + /* router checker startup/shutdown state */ + int ln_rc_state; + /* router checker's event queue */ + lnet_handle_eq_t ln_rc_eqh; + /* rcd still pending on net */ + struct list_head ln_rcd_deathrow; + /* rcd ready for free */ + struct list_head ln_rcd_zombie; + /* serialise startup/shutdown */ + struct semaphore ln_rc_signal; + + struct mutex ln_api_mutex; + struct mutex ln_lnd_mutex; + int ln_init; /* LNetInit() called? */ + /* Have I called LNetNIInit myself? */ + int ln_niinit_self; + /* LNetNIInit/LNetNIFini counter */ + int ln_refcount; + /* shutdown in progress */ + int ln_shutdown; + + int ln_routing; /* am I a router? */ + lnet_pid_t ln_pid; /* requested pid */ + /* uniquely identifies this ni in this epoch */ + __u64 ln_interface_cookie; + /* registered LNDs */ + struct list_head ln_lnds; + + /* space for network names */ + char *ln_network_tokens; + int ln_network_tokens_nob; + /* test protocol compatibility flags */ + int ln_testprotocompat; + +} lnet_t; + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/linux/api-support.h b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h new file mode 100644 index 000000000000..ca78a0a4e908 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/linux/api-support.h @@ -0,0 +1,43 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_API_SUPPORT_H__ +#define __LINUX_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h new file mode 100644 index 000000000000..d2c0a70f1f7e --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-lnet.h @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_LNET_H__ +#define __LNET_LINUX_LIB_LNET_H__ + +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include +# include +# include + +static inline __u64 +lnet_page2phys (struct page *p) +{ + /* compiler optimizer will elide unused branches */ + + switch (sizeof(typeof(page_to_phys(p)))) { + case 4: + /* page_to_phys returns a 32 bit physical address. This must + * be a 32 bit machine with <= 4G memory and we must ensure we + * don't sign extend when converting to 64 bits. */ + return (unsigned long)page_to_phys(p); + + case 8: + /* page_to_phys returns a 64 bit physical address :) */ + return page_to_phys(p); + + default: + LBUG(); + return 0; + } +} + + +#define LNET_ROUTER + +#endif /* __LNET_LINUX_LIB_LNET_H__ */ diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h new file mode 100644 index 000000000000..669e8c038534 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/linux/lib-types.h @@ -0,0 +1,45 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LIB_TYPES_H__ +#define __LNET_LINUX_LIB_TYPES_H__ + +#ifndef __LNET_LIB_TYPES_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/linux/lnet.h b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h new file mode 100644 index 000000000000..1e888f1efc45 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/linux/lnet.h @@ -0,0 +1,56 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_LINUX_LNET_H__ +#define __LNET_LINUX_LNET_H__ + +#ifndef __LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +/* + * lnet.h + * + * User application interface file + */ + +#include +#include + +#define cfs_tcp_sendpage(sk, page, offset, size, flags) \ + tcp_sendpage(sk, page, offset, size, flags) + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h new file mode 100644 index 000000000000..1bde44ebb911 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lnet-sysctl.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_SYSCTL_H__ +#define __LNET_SYSCTL_H__ + +#if defined(CONFIG_SYSCTL) + + +#define CTL_KRANAL 201 +#define CTL_O2IBLND 205 +#define CTL_PTLLND 206 +#define CTL_QSWNAL 207 +#define CTL_SOCKLND 208 +#define CTL_GNILND 210 + + +#endif + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lnet.h b/drivers/staging/lustre/include/linux/lnet/lnet.h new file mode 100644 index 000000000000..c532b15d7643 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lnet.h @@ -0,0 +1,51 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_H__ +#define __LNET_H__ + +/* + * lnet.h + * + * User application interface file + */ +#include + +#include +#include + +#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/linux/lnet/lnetctl.h new file mode 100644 index 000000000000..b22daa234255 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lnetctl.h @@ -0,0 +1,80 @@ +/* + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#include +#include + +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" +#define LNET_DEV_MAJOR 10 +#define LNET_DEV_MINOR 240 +#define OBD_DEV_ID 1 +#define OBD_DEV_NAME "obd" +#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME +#define OBD_DEV_MAJOR 10 +#define OBD_DEV_MINOR 241 +#define SMFS_DEV_ID 2 +#define SMFS_DEV_PATH "/dev/snapdev" +#define SMFS_DEV_MAJOR 10 +#define SMFS_DEV_MINOR 242 + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_list_nids(int argc, char **argv); +int jt_ptl_which_nid(int argc, char **argv); +int jt_ptl_print_interfaces(int argc, char **argv); +int jt_ptl_add_interface(int argc, char **argv); +int jt_ptl_del_interface(int argc, char **argv); +int jt_ptl_print_peers (int argc, char **argv); +int jt_ptl_add_peer (int argc, char **argv); +int jt_ptl_del_peer (int argc, char **argv); +int jt_ptl_print_connections (int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_print_active_txs(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); +int jt_ptl_testprotocompat(int argc, char **argv); +int jt_ptl_memhog(int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lnetst.h b/drivers/staging/lustre/include/linux/lnet/lnetst.h new file mode 100644 index 000000000000..d90f94e94601 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/lnetst.h @@ -0,0 +1,491 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/lnetst.h + * + * Author: Liang Zhen + */ + +#ifndef __LNET_ST_H__ +#define __LNET_ST_H__ + +#include +#include +#include + +#define LST_FEAT_NONE (0) +#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ + +#define LST_FEATS_EMPTY (LST_FEAT_NONE) +#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) + +#define LST_NAME_SIZE 32 /* max name buffer length */ + +#define LSTIO_DEBUG 0xC00 /* debug */ +#define LSTIO_SESSION_NEW 0xC01 /* create session */ +#define LSTIO_SESSION_END 0xC02 /* end session */ +#define LSTIO_SESSION_INFO 0xC03 /* query session */ +#define LSTIO_GROUP_ADD 0xC10 /* add group */ +#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ +#define LSTIO_GROUP_INFO 0xC12 /* query defailt infomation of specified group */ +#define LSTIO_GROUP_DEL 0xC13 /* delete group */ +#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ +#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ +#define LSTIO_BATCH_ADD 0xC20 /* add batch */ +#define LSTIO_BATCH_START 0xC21 /* start batch */ +#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ +#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ +#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ +#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ +#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ +#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ +#define LSTIO_STAT_QUERY 0xC30 /* get stats */ + +typedef struct { + lnet_nid_t ses_nid; /* nid of console node */ + __u64 ses_stamp; /* time stamp */ +} lst_sid_t; /*** session id */ + +extern lst_sid_t LST_INVALID_SID; + +typedef struct { + __u64 bat_id; /* unique id in session */ +} lst_bid_t; /*** batch id (group of tests) */ + +/* Status of test node */ +#define LST_NODE_ACTIVE 0x1 /* node in this session */ +#define LST_NODE_BUSY 0x2 /* node is taken by other session */ +#define LST_NODE_DOWN 0x4 /* node is down */ +#define LST_NODE_UNKNOWN 0x8 /* node not in session */ + +typedef struct { + lnet_process_id_t nde_id; /* id of node */ + int nde_state; /* state of node */ +} lstcon_node_ent_t; /*** node entry, for list_group command */ + +typedef struct { + int nle_nnode; /* # of nodes */ + int nle_nactive; /* # of active nodes */ + int nle_nbusy; /* # of busy nodes */ + int nle_ndown; /* # of down nodes */ + int nle_nunknown; /* # of unknown nodes */ +} lstcon_ndlist_ent_t; /*** node_list entry, for list_batch command */ + +typedef struct { + int tse_type; /* test type */ + int tse_loop; /* loop count */ + int tse_concur; /* concurrency of test */ +} lstcon_test_ent_t; /*** test summary entry, for list_batch command */ + +typedef struct { + int bae_state; /* batch status */ + int bae_timeout; /* batch timeout */ + int bae_ntest; /* # of tests in the batch */ +} lstcon_batch_ent_t; /*** batch summary entry, for list_batch command */ + +typedef struct { + lstcon_ndlist_ent_t tbe_cli_nle; /* client (group) node_list entry */ + lstcon_ndlist_ent_t tbe_srv_nle; /* server (group) node_list entry */ + union { + lstcon_test_ent_t tbe_test; /* test entry */ + lstcon_batch_ent_t tbe_batch; /* batch entry */ + } u; +} lstcon_test_batch_ent_t; /*** test/batch verbose information entry, + *** for list_batch command */ + +typedef struct { + struct list_head rpe_link; /* link chain */ + lnet_process_id_t rpe_peer; /* peer's id */ + struct timeval rpe_stamp; /* time stamp of RPC */ + int rpe_state; /* peer's state */ + int rpe_rpc_errno; /* RPC errno */ + + lst_sid_t rpe_sid; /* peer's session id */ + int rpe_fwk_errno; /* framework errno */ + int rpe_priv[4]; /* private data */ + char rpe_payload[0]; /* private reply payload */ +} lstcon_rpc_ent_t; + +typedef struct { + int trs_rpc_stat[4]; /* RPCs stat (0: total, 1: failed, 2: finished, 4: reserved */ + int trs_rpc_errno; /* RPC errno */ + int trs_fwk_stat[8]; /* framework stat */ + int trs_fwk_errno; /* errno of the first remote error */ + void *trs_fwk_private; /* private framework stat */ +} lstcon_trans_stat_t; + +static inline int +lstcon_rpc_stat_total(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; +} + +static inline int +lstcon_rpc_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; +} + +static inline int +lstcon_rpc_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; +} + +static inline int +lstcon_sesop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_active(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_sesqry_stat_busy(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_sesqry_stat_unknown(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_tsbop_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbop_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_idle(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_tsbqry_stat_run(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +static inline int +lstcon_tsbqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; +} + +static inline int +lstcon_statqry_stat_success(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; +} + +static inline int +lstcon_statqry_stat_failure(lstcon_trans_stat_t *stat, int inc) +{ + return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; +} + +/* create a session */ +typedef struct { + int lstio_ses_key; /* IN: local key */ + int lstio_ses_timeout; /* IN: session timeout */ + int lstio_ses_force; /* IN: force create ? */ + /** IN: session features */ + unsigned lstio_ses_feats; + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* IN: session name */ +} lstio_session_new_args_t; + +/* query current session */ +typedef struct { + lst_sid_t *lstio_ses_idp; /* OUT: session id */ + int *lstio_ses_keyp; /* OUT: local key */ + /** OUT: session features */ + unsigned *lstio_ses_featp; + lstcon_ndlist_ent_t *lstio_ses_ndinfo; /* OUT: */ + int lstio_ses_nmlen; /* IN: name length */ + char *lstio_ses_namep; /* OUT: session name */ +} lstio_session_info_args_t; + +/* delete a session */ +typedef struct { + int lstio_ses_key; /* IN: session key */ +} lstio_session_end_args_t; + +#define LST_OPC_SESSION 1 +#define LST_OPC_GROUP 2 +#define LST_OPC_NODES 3 +#define LST_OPC_BATCHCLI 4 +#define LST_OPC_BATCHSRV 5 + +typedef struct { + int lstio_dbg_key; /* IN: session key */ + int lstio_dbg_type; /* IN: debug sessin|batch|group|nodes list */ + int lstio_dbg_flags; /* IN: reserved debug flags */ + int lstio_dbg_timeout; /* IN: timeout of debug */ + + int lstio_dbg_nmlen; /* IN: len of name */ + char *lstio_dbg_namep; /* IN: name of group|batch */ + int lstio_dbg_count; /* IN: # of test nodes to debug */ + lnet_process_id_t *lstio_dbg_idsp; /* IN: id of test nodes */ + struct list_head *lstio_dbg_resultp; /* OUT: list head of result buffer */ +} lstio_debug_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_add_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ +} lstio_group_del_args_t; + +#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ +#define LST_GROUP_REFRESH 2 /* refresh inactive nodes in the group */ +#define LST_GROUP_RMND 3 /* delete nodes from the group */ + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_opc; /* IN: OPC */ + int lstio_grp_args; /* IN: arguments */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes id */ + lnet_process_id_t *lstio_grp_idsp; /* IN: array of nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result buffer */ +} lstio_group_update_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name length */ + char *lstio_grp_namep; /* IN: group name */ + int lstio_grp_count; /* IN: # of nodes */ + /** OUT: session features */ + unsigned *lstio_grp_featp; + lnet_process_id_t *lstio_grp_idsp; /* IN: nodes */ + struct list_head *lstio_grp_resultp; /* OUT: list head of result buffer */ +} lstio_group_nodes_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_idx; /* IN: group idx */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* OUT: name */ +} lstio_group_list_args_t; + +typedef struct { + int lstio_grp_key; /* IN: session key */ + int lstio_grp_nmlen; /* IN: name len */ + char *lstio_grp_namep; /* IN: name */ + lstcon_ndlist_ent_t *lstio_grp_entp; /* OUT: description of group */ + + int *lstio_grp_idxp; /* IN/OUT: node index */ + int *lstio_grp_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_grp_dentsp; /* OUT: nodent array */ +} lstio_group_info_args_t; + +#define LST_DEFAULT_BATCH "batch" /* default batch name */ + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_add_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_del_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_timeout; /* IN: timeout for the batch */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_run_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_force; /* IN: abort unfinished test RPC */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_stop_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_testidx; /* IN: test index */ + int lstio_bat_client; /* IN: is test client? */ + int lstio_bat_timeout; /* IN: timeout for waiting */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ + struct list_head *lstio_bat_resultp; /* OUT: list head of result buffer */ +} lstio_batch_query_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_idx; /* IN: index */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: batch name */ +} lstio_batch_list_args_t; + +typedef struct { + int lstio_bat_key; /* IN: session key */ + int lstio_bat_nmlen; /* IN: name length */ + char *lstio_bat_namep; /* IN: name */ + int lstio_bat_server; /* IN: query server or not */ + int lstio_bat_testidx; /* IN: test index */ + lstcon_test_batch_ent_t *lstio_bat_entp; /* OUT: batch ent */ + + int *lstio_bat_idxp; /* IN/OUT: index of node */ + int *lstio_bat_ndentp; /* IN/OUT: # of nodent */ + lstcon_node_ent_t *lstio_bat_dentsp; /* array of nodent */ +} lstio_batch_info_args_t; + +/* add stat in session */ +typedef struct { + int lstio_sta_key; /* IN: session key */ + int lstio_sta_timeout; /* IN: timeout for stat requst */ + int lstio_sta_nmlen; /* IN: group name length */ + char *lstio_sta_namep; /* IN: group name */ + int lstio_sta_count; /* IN: # of pid */ + lnet_process_id_t *lstio_sta_idsp; /* IN: pid */ + struct list_head *lstio_sta_resultp; /* OUT: list head of result buffer */ +} lstio_stat_args_t; + +typedef enum { + LST_TEST_BULK = 1, + LST_TEST_PING = 2 +} lst_test_type_t; + +/* create a test in a batch */ +#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ + +typedef struct { + int lstio_tes_key; /* IN: session key */ + int lstio_tes_bat_nmlen; /* IN: batch name len */ + char *lstio_tes_bat_name; /* IN: batch name */ + int lstio_tes_type; /* IN: test type */ + int lstio_tes_oneside; /* IN: one sided test */ + int lstio_tes_loop; /* IN: loop count */ + int lstio_tes_concur; /* IN: concurrency */ + + int lstio_tes_dist; /* IN: node distribution in destination groups */ + int lstio_tes_span; /* IN: node span in destination groups */ + int lstio_tes_sgrp_nmlen; /* IN: source group name length */ + char *lstio_tes_sgrp_name; /* IN: group name */ + int lstio_tes_dgrp_nmlen; /* IN: destination group name length */ + char *lstio_tes_dgrp_name; /* IN: group name */ + + int lstio_tes_param_len; /* IN: param buffer len */ + void *lstio_tes_param; /* IN: parameter for specified test: + lstio_bulk_param_t, + lstio_ping_param_t, + ... more */ + int *lstio_tes_retp; /* OUT: private returned value */ + struct list_head *lstio_tes_resultp; /* OUT: list head of result buffer */ +} lstio_test_args_t; + +typedef enum { + LST_BRW_READ = 1, + LST_BRW_WRITE = 2 +} lst_brw_type_t; + +typedef enum { + LST_BRW_CHECK_NONE = 1, + LST_BRW_CHECK_SIMPLE = 2, + LST_BRW_CHECK_FULL = 3 +} lst_brw_flags_t; + +typedef struct { + int blk_opc; /* bulk operation code */ + int blk_size; /* size (bytes) */ + int blk_time; /* time of running the test*/ + int blk_flags; /* reserved flags */ +} lst_test_bulk_param_t; + +typedef struct { + int png_size; /* size of ping message */ + int png_time; /* time */ + int png_loop; /* loop */ + int png_flags; /* reserved flags */ +} lst_test_ping_param_t; + +/* more tests */ +typedef struct { + __u32 errors; + __u32 rpcs_sent; + __u32 rpcs_rcvd; + __u32 rpcs_dropped; + __u32 rpcs_expired; + __u64 bulk_get; + __u64 bulk_put; +} WIRE_ATTR srpc_counters_t; + +typedef struct { + /** milliseconds since current session started */ + __u32 running_ms; + __u32 active_batches; + __u32 zombie_sessions; + __u32 brw_errors; + __u32 ping_errors; +} WIRE_ATTR sfw_counters_t; + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd.h b/drivers/staging/lustre/include/linux/lnet/ptllnd.h new file mode 100644 index 000000000000..fc1ce8ed1f8b --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/ptllnd.h @@ -0,0 +1,94 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd.h + * + * Author: PJ Kirner + */ + +/* + * The PTLLND was designed to support Portals with + * Lustre and non-lustre UNLINK semantics. + * However for now the two targets are Cray Portals + * on the XT3 and Lustre Portals (for testing) both + * have Lustre UNLINK semantics, so this is defined + * by default. + */ +#define LUSTRE_PORTALS_UNLINK_SEMANTICS + + +#ifdef _USING_LUSTRE_PORTALS_ + +/* NIDs are 64-bits on Lustre Portals */ +#define FMT_NID LPU64 +#define FMT_PID "%d" + +/* When using Lustre Portals Lustre completion semantics are imlicit*/ +#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0 + +#else /* _USING_CRAY_PORTALS_ */ + +/* NIDs are integers on Cray Portals */ +#define FMT_NID "%u" +#define FMT_PID "%d" + +/* When using Cray Portals this is defined in the Cray Portals Header*/ +/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */ + +/* Can compare handles directly on Cray Portals */ +#define PtlHandleIsEqual(a,b) ((a) == (b)) + +/* Diffrent error types on Cray Portals*/ +#define ptl_err_t ptl_ni_fail_t + +/* + * The Cray Portals has no maximum number of IOVs. The + * maximum is limited only by memory and size of the + * int parameters (2^31-1). + * Lustre only really require that the underyling + * implemenation to support at least LNET_MAX_IOV, + * so for Cray portals we can safely just use that + * value here. + * + */ +#define PTL_MD_MAX_IOV LNET_MAX_IOV + +#endif + +#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID + +/* Align incoming small request messages to an 8 byte boundary if this is + * supported to avoid alignment issues on some architectures */ +#ifndef PTL_MD_LOCAL_ALIGN8 +# define PTL_MD_LOCAL_ALIGN8 0 +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h new file mode 100644 index 000000000000..7d12b3a23a96 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/ptllnd_wire.h @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/ptllnd_wire.h + * + * Author: PJ Kirner + */ + +/* Minimum buffer size that any peer will post to receive ptllnd messages */ +#define PTLLND_MIN_BUFFER_SIZE 256 + +/************************************************************************ + * Tunable defaults that {u,k}lnds/ptllnd should have in common. + */ + +#define PTLLND_PORTAL 9 /* The same portal PTLPRC used when talking to cray portals */ +#define PTLLND_PID 9 /* The Portals PID */ +#define PTLLND_PEERCREDITS 8 /* concurrent sends to 1 peer */ + +/* Default buffer size for kernel ptllnds (guaranteed eager) */ +#define PTLLND_MAX_KLND_MSG_SIZE 512 + +/* Default buffer size for catamount ptllnds (not guaranteed eager) - large + * enough to avoid RDMA for anything sent while control is not in liblustre */ +#define PTLLND_MAX_ULND_MSG_SIZE 512 + + +/************************************************************************ + * Portals LND Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved + * above is for bulk data transfer */ +#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */ + +typedef struct +{ + lnet_hdr_t kptlim_hdr; /* portals header */ + char kptlim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kptl_immediate_msg_t; + +typedef struct +{ + lnet_hdr_t kptlrm_hdr; /* portals header */ + __u64 kptlrm_matchbits; /* matchbits */ +} WIRE_ATTR kptl_rdma_msg_t; + +typedef struct +{ + __u64 kptlhm_matchbits; /* matchbits */ + __u32 kptlhm_max_msg_size; /* max message size */ +} WIRE_ATTR kptl_hello_msg_t; + +typedef struct +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ptlm_magic; /* I'm a Portals LND message */ + __u16 ptlm_version; /* this is my version number */ + __u8 ptlm_type; /* the message type */ + __u8 ptlm_credits; /* returned credits */ + __u32 ptlm_nob; /* # bytes in whole message */ + __u32 ptlm_cksum; /* checksum (0 == no checksum) */ + __u64 ptlm_srcnid; /* sender's NID */ + __u64 ptlm_srcstamp; /* sender's incarnation */ + __u64 ptlm_dstnid; /* destination's NID */ + __u64 ptlm_dststamp; /* destination's incarnation */ + __u32 ptlm_srcpid; /* sender's PID */ + __u32 ptlm_dstpid; /* destination's PID */ + + union { + kptl_immediate_msg_t immediate; + kptl_rdma_msg_t rdma; + kptl_hello_msg_t hello; + } WIRE_ATTR ptlm_u; + +} kptl_msg_t; + +/* kptl_msg_t::ptlm_credits is only a __u8 */ +#define PTLLND_MSG_MAX_CREDITS ((typeof(((kptl_msg_t*) 0)->ptlm_credits)) -1) + +#define PTLLND_MSG_MAGIC LNET_PROTO_PTL_MAGIC +#define PTLLND_MSG_VERSION 0x04 + +#define PTLLND_RDMA_OK 0x00 +#define PTLLND_RDMA_FAIL 0x01 + +#define PTLLND_MSG_TYPE_INVALID 0x00 +#define PTLLND_MSG_TYPE_PUT 0x01 +#define PTLLND_MSG_TYPE_GET 0x02 +#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/ +#define PTLLND_MSG_TYPE_NOOP 0x04 +#define PTLLND_MSG_TYPE_HELLO 0x05 +#define PTLLND_MSG_TYPE_NAK 0x06 diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h new file mode 100644 index 000000000000..bacc74933a39 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/socklnd.h @@ -0,0 +1,103 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/include/lnet/socklnd.h + * + * #defines shared between socknal implementation and utilities + */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ + +#include +#include + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN + +typedef struct { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR ksock_hello_msg_t; + +typedef struct { + lnet_hdr_t ksnm_hdr; /* lnet hdr */ + + /* + * ksnm_payload is removed because of winnt compiler's limitation: + * zero-sized array can only be placed at the tail of [nested] + * structure definitions. lnet payload will be stored just after + * the body of structure ksock_lnet_msg_t + */ +} WIRE_ATTR ksock_lnet_msg_t; + +typedef struct { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ + union { + ksock_lnet_msg_t lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR ksock_msg_t; + +static inline void +socklnd_init_msg(ksock_msg_t *msg, int type) +{ + msg->ksm_csum = 0; + msg->ksm_type = type; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +/* We need to know this number to parse hello msg from ksocklnd in + * other LND (usocklnd, for example) */ +#define KSOCK_PROTO_V2 2 +#define KSOCK_PROTO_V3 3 + +#endif diff --git a/drivers/staging/lustre/include/linux/lnet/types.h b/drivers/staging/lustre/include/linux/lnet/types.h new file mode 100644 index 000000000000..4f63b7acb9d7 --- /dev/null +++ b/drivers/staging/lustre/include/linux/lnet/types.h @@ -0,0 +1,503 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ + +/** \addtogroup lnet + * @{ */ + +#include + +/** \addtogroup lnet_addr + * @{ */ + +/** Portal reserved for LNet's own use. + * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. + */ +#define LNET_RESERVED_PORTAL 0 + +/** + * Address of an end-point in an LNet network. + * + * A node can have multiple end-points and hence multiple addresses. + * An LNet network can be a simple network (e.g. tcp0) or a network of + * LNet networks connected by LNet routers. Therefore an end-point address + * has two parts: network ID, and address within a network. + * + * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. + */ +typedef __u64 lnet_nid_t; +/** + * ID of a process in a node. Shortened as PID to distinguish from + * lnet_process_id_t, the global process ID. + */ +typedef __u32 lnet_pid_t; + +/** wildcard NID that matches any end-point address */ +#define LNET_NID_ANY ((lnet_nid_t) -1) +/** wildcard PID that matches any lnet_pid_t */ +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ + +#define LNET_TIME_FOREVER (-1) + +/** + * Objects maintained by the LNet are accessed through handles. Handle types + * have names of the form lnet_handle_xx_t, where xx is one of the two letter + * object type codes ('eq' for event queue, 'md' for memory descriptor, and + * 'me' for match entry). + * Each type of object is given a unique handle type to enhance type checking. + * The type lnet_handle_any_t can be used when a generic handle is needed. + * Every handle value can be converted into a value of type lnet_handle_any_t + * without loss of information. + */ +typedef struct { + __u64 cookie; +} lnet_handle_any_t; + +typedef lnet_handle_any_t lnet_handle_eq_t; +typedef lnet_handle_any_t lnet_handle_md_t; +typedef lnet_handle_any_t lnet_handle_me_t; + +#define LNET_WIRE_HANDLE_COOKIE_NONE (-1) + +/** + * Invalidate handle \a h. + */ +static inline void LNetInvalidateHandle(lnet_handle_any_t *h) +{ + h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE; +} + +/** + * Compare handles \a h1 and \a h2. + * + * \return 1 if handles are equal, 0 if otherwise. + */ +static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2) +{ + return (h1.cookie == h2.cookie); +} + +/** + * Check whether handle \a h is invalid. + * + * \return 1 if handle is invalid, 0 if valid. + */ +static inline int LNetHandleIsInvalid(lnet_handle_any_t h) +{ + return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie); +} + +/** + * Global process ID. + */ +typedef struct { + /** node id */ + lnet_nid_t nid; + /** process id */ + lnet_pid_t pid; +} lnet_process_id_t; +/** @} lnet_addr */ + +/** \addtogroup lnet_me + * @{ */ + +/** + * Specifies whether the match entry or memory descriptor should be unlinked + * automatically (LNET_UNLINK) or not (LNET_RETAIN). + */ +typedef enum { + LNET_RETAIN = 0, + LNET_UNLINK +} lnet_unlink_t; + +/** + * Values of the type lnet_ins_pos_t are used to control where a new match + * entry is inserted. The value LNET_INS_BEFORE is used to insert the new + * entry before the current entry or before the head of the list. The value + * LNET_INS_AFTER is used to insert the new entry after the current entry + * or after the last item in the list. + */ +typedef enum { + /** insert ME before current position or head of the list */ + LNET_INS_BEFORE, + /** insert ME after current position or tail of the list */ + LNET_INS_AFTER, + /** attach ME at tail of local CPU partition ME list */ + LNET_INS_LOCAL +} lnet_ins_pos_t; + +/** @} lnet_me */ + +/** \addtogroup lnet_md + * @{ */ + +/** + * Defines the visible parts of a memory descriptor. Values of this type + * are used to initialize memory descriptors. + */ +typedef struct { + /** + * Specify the memory region associated with the memory descriptor. + * If the options field has: + * - LNET_MD_KIOV bit set: The start field points to the starting + * address of an array of lnet_kiov_t and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The lnet_kiov_t is used to describe page-based + * fragments that are not necessarily mapped in virtal memory. + * - LNET_MD_IOVEC bit set: The start field points to the starting + * address of an array of struct iovec and the length field specifies + * the number of entries in the array. The length can't be bigger + * than LNET_MAX_IOV. The struct iovec is used to describe fragments + * that have virtual addresses. + * - Otherwise: The memory region is contiguous. The start field + * specifies the starting address for the memory region and the + * length field specifies its length. + * + * When the memory region is fragmented, all fragments but the first + * one must start on page boundary, and all but the last must end on + * page boundary. + */ + void *start; + unsigned int length; + /** + * Specifies the maximum number of operations that can be performed + * on the memory descriptor. An operation is any action that could + * possibly generate an event. In the usual case, the threshold value + * is decremented for each operation on the MD. When the threshold + * drops to zero, the MD becomes inactive and does not respond to + * operations. A threshold value of LNET_MD_THRESH_INF indicates that + * there is no bound on the number of operations that may be applied + * to a MD. + */ + int threshold; + /** + * Specifies the largest incoming request that the memory descriptor + * should respond to. When the unused portion of a MD (length - + * local offset) falls below this value, the MD becomes inactive and + * does not respond to further operations. This value is only used + * if the LNET_MD_MAX_SIZE option is set. + */ + int max_size; + /** + * Specifies the behavior of the memory descriptor. A bitwise OR + * of the following values can be used: + * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. + * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. + * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory + * region is provided by the incoming request. By default, the + * offset is maintained locally. When maintained locally, the + * offset is incremented by the length of the request so that + * the next operation (PUT or GET) will access the next part of + * the memory region. Note that only one offset variable exists + * per memory descriptor. If both PUT and GET operations are + * performed on a memory descriptor, the offset is updated each time. + * - LNET_MD_TRUNCATE: The length provided in the incoming request can + * be reduced to match the memory available in the region (determined + * by subtracting the offset from the length of the memory region). + * By default, if the length in the incoming operation is greater + * than the amount of memory available, the operation is rejected. + * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for + * incoming PUT operations, even if requested. By default, + * acknowledgments are sent for PUT operations that request an + * acknowledgment. Acknowledgments are never sent for GET operations. + * The data sent in the REPLY serves as an implicit acknowledgment. + * - LNET_MD_KIOV: The start and length fields specify an array of + * lnet_kiov_t. + * - LNET_MD_IOVEC: The start and length fields specify an array of + * struct iovec. + * - LNET_MD_MAX_SIZE: The max_size field is valid. + * + * Note: + * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather + * capability for memory descriptors. They can't be both set. + * - When LNET_MD_MAX_SIZE is set, the total length of the memory + * region (i.e. sum of all fragment lengths) must not be less than + * \a max_size. + */ + unsigned int options; + /** + * A user-specified value that is associated with the memory + * descriptor. The value does not need to be a pointer, but must fit + * in the space used by a pointer. This value is recorded in events + * associated with operations on this MD. + */ + void *user_ptr; + /** + * A handle for the event queue used to log the operations performed on + * the memory region. If this argument is a NULL handle (i.e. nullified + * by LNetInvalidateHandle()), operations performed on this memory + * descriptor are not logged. + */ + lnet_handle_eq_t eq_handle; +} lnet_md_t; + +/* Max Transfer Unit (minimum supported everywhere). + * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) + * these limits are system wide and not interface-local. */ +#define LNET_MTU_BITS 20 +#define LNET_MTU (1 << LNET_MTU_BITS) + +/** limit on the number of fragments in discontiguous MDs */ +#define LNET_MAX_IOV 256 + +/* Max payload size */ +# define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD +# if (LNET_MAX_PAYLOAD < LNET_MTU) +# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" +# else +# if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) +/* PAGE_SIZE is a constant: check with cpp! */ +# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" +# endif +# endif + +/** + * Options for the MD structure. See lnet_md_t::options. + */ +#define LNET_MD_OP_PUT (1 << 0) +/** See lnet_md_t::options. */ +#define LNET_MD_OP_GET (1 << 1) +/** See lnet_md_t::options. */ +#define LNET_MD_MANAGE_REMOTE (1 << 2) +/* unused (1 << 3) */ +/** See lnet_md_t::options. */ +#define LNET_MD_TRUNCATE (1 << 4) +/** See lnet_md_t::options. */ +#define LNET_MD_ACK_DISABLE (1 << 5) +/** See lnet_md_t::options. */ +#define LNET_MD_IOVEC (1 << 6) +/** See lnet_md_t::options. */ +#define LNET_MD_MAX_SIZE (1 << 7) +/** See lnet_md_t::options. */ +#define LNET_MD_KIOV (1 << 8) + +/* For compatibility with Cray Portals */ +#define LNET_MD_PHYS 0 + +/** Infinite threshold on MD operations. See lnet_md_t::threshold */ +#define LNET_MD_THRESH_INF (-1) + +/* NB lustre portals uses struct iovec internally! */ +typedef struct iovec lnet_md_iovec_t; + +/** + * A page-based fragment of a MD. + */ +typedef struct { + /** Pointer to the page where the fragment resides */ + struct page *kiov_page; + /** Length in bytes of the fragment */ + unsigned int kiov_len; + /** + * Starting offset of the fragment within the page. Note that the + * end of the fragment must not pass the end of the page; i.e., + * kiov_len + kiov_offset <= PAGE_CACHE_SIZE. + */ + unsigned int kiov_offset; +} lnet_kiov_t; +/** @} lnet_md */ + +/** \addtogroup lnet_eq + * @{ */ + +/** + * Six types of events can be logged in an event queue. + */ +typedef enum { + /** An incoming GET operation has completed on the MD. */ + LNET_EVENT_GET = 1, + /** + * An incoming PUT operation has completed on the MD. The + * underlying layers will not alter the memory (on behalf of this + * operation) once this event has been logged. + */ + LNET_EVENT_PUT, + /** + * A REPLY operation has completed. This event is logged after the + * data (if any) from the REPLY has been written into the MD. + */ + LNET_EVENT_REPLY, + /** An acknowledgment has been received. */ + LNET_EVENT_ACK, + /** + * An outgoing send (PUT or GET) operation has completed. This event + * is logged after the entire buffer has been sent and it is safe for + * the caller to reuse the buffer. + * + * Note: + * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can + * happen even when the message has not yet been put out on wire. + * - It's unsafe to assume that in an outgoing GET operation + * the LNET_EVENT_SEND event would happen before the + * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and + * LNET_EVENT_ACK events in an outgoing PUT operation. + */ + LNET_EVENT_SEND, + /** + * A MD has been unlinked. Note that LNetMDUnlink() does not + * necessarily trigger an LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + LNET_EVENT_UNLINK, +} lnet_event_kind_t; + +#define LNET_SEQ_BASETYPE long +typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t; +#define LNET_SEQ_GT(a,b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0) + +/* XXX + * cygwin need the pragma line, not clear if it's needed in other places. + * checking!!! + */ +#ifdef __CYGWIN__ +#pragma pack(push, 4) +#endif + +/** + * Information about an event on a MD. + */ +typedef struct { + /** The identifier (nid, pid) of the target. */ + lnet_process_id_t target; + /** The identifier (nid, pid) of the initiator. */ + lnet_process_id_t initiator; + /** + * The NID of the immediate sender. If the request has been forwarded + * by routers, this is the NID of the last hop; otherwise it's the + * same as the initiator. + */ + lnet_nid_t sender; + /** Indicates the type of the event. */ + lnet_event_kind_t type; + /** The portal table index specified in the request */ + unsigned int pt_index; + /** A copy of the match bits specified in the request. */ + __u64 match_bits; + /** The length (in bytes) specified in the request. */ + unsigned int rlength; + /** + * The length (in bytes) of the data that was manipulated by the + * operation. For truncated operations, the manipulated length will be + * the number of bytes specified by the MD (possibly with an offset, + * see lnet_md_t). For all other operations, the manipulated length + * will be the length of the requested operation, i.e. rlength. + */ + unsigned int mlength; + /** + * The handle to the MD associated with the event. The handle may be + * invalid if the MD has been unlinked. + */ + lnet_handle_md_t md_handle; + /** + * A snapshot of the state of the MD immediately after the event has + * been processed. In particular, the threshold field in md will + * reflect the value of the threshold after the operation occurred. + */ + lnet_md_t md; + /** + * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. + * \see LNetPut + */ + __u64 hdr_data; + /** + * Indicates the completion status of the operation. It's 0 for + * successful operations, otherwise it's an error code. + */ + int status; + /** + * Indicates whether the MD has been unlinked. Note that: + * - An event with unlinked set is the last event on the MD. + * - This field is also set for an explicit LNET_EVENT_UNLINK event. + * \see LNetMDUnlink + */ + int unlinked; + /** + * The displacement (in bytes) into the memory region that the + * operation used. The offset can be determined by the operation for + * a remote managed MD or by the local MD. + * \see lnet_md_t::options + */ + unsigned int offset; + /** + * The sequence number for this event. Sequence numbers are unique + * to each event. + */ + volatile lnet_seq_t sequence; +} lnet_event_t; +#ifdef __CYGWIN__ +#pragma pop +#endif + +/** + * Event queue handler function type. + * + * The EQ handler runs for each event that is deposited into the EQ. The + * handler is supplied with a pointer to the event that triggered the + * handler invocation. + * + * The handler must not block, must be reentrant, and must not call any LNet + * API functions. It should return as quickly as possible. + */ +typedef void (*lnet_eq_handler_t)(lnet_event_t *event); +#define LNET_EQ_HANDLER_NONE NULL +/** @} lnet_eq */ + +/** \addtogroup lnet_data + * @{ */ + +/** + * Specify whether an acknowledgment should be sent by target when the PUT + * operation completes (i.e., when the data has been written to a MD of the + * target process). + * + * \see lnet_md_t::options for the discussion on LNET_MD_ACK_DISABLE by which + * acknowledgments can be disabled for a MD. + */ +typedef enum { + /** Request an acknowledgment */ + LNET_ACK_REQ, + /** Request that no acknowledgment should be generated. */ + LNET_NOACK_REQ +} lnet_ack_req_t; +/** @} lnet_data */ + +/** @} lnet */ +#endif diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig new file mode 100644 index 000000000000..00850eeb6a8c --- /dev/null +++ b/drivers/staging/lustre/lnet/Kconfig @@ -0,0 +1,40 @@ +config LNET + tristate "Lustre networking subsystem" + depends on LUSTRE_FS + +config LNET_MAX_PAYLOAD + int "Lustre lnet max transfer payload (default 2MB)" + depends on LUSTRE_FS + default "1048576" + help + This option defines the maximum size of payload in bytes that lnet + can put into its transport. + + If unsure, use default. + +config LNET_SELFTEST + tristate "Lustre networking self testing" + depends on LNET + help + Choose Y here if you want to do lnet self testing. To compile this + as a module, choose M here: the module will be called lnet_selftest. + + To compile this as a kernel modules, choose M here and it will be + called lnet_selftest. + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LNET_XPRT_IB + tristate "LNET infiniband support" + depends on LNET && INFINIBAND && INFINIBAND_ADDR_TRANS + default LNET && INFINIBAND + help + This option allows the LNET users to use infiniband as an + RDMA-enabled transport. + + To compile this as a kernel module, choose M here and it will be + called ko2iblnd. + + If unsure, say N. diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile new file mode 100644 index 000000000000..374212b1555a --- /dev/null +++ b/drivers/staging/lustre/lnet/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) := klnds/ lnet/ selftest/ diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile new file mode 100644 index 000000000000..c23e4f67f837 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LNET) += o2iblnd/ socklnd/ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile new file mode 100644 index 000000000000..71b7d8418357 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o +ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + + +ccflags-y := -I$(src)/../../include diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 000000000000..f4b958bbe5a3 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,3256 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +lnd_t the_o2iblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_query = kiblnd_query, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +kib_data_t kiblnd_data; + +__u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} + +static char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +static int +kiblnd_msgtype2size(int type) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + + switch (type) { + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + return hdr_size + sizeof(kib_connparams_t); + + case IBLND_MSG_NOOP: + return hdr_size; + + case IBLND_MSG_IMMEDIATE: + return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]); + + case IBLND_MSG_PUT_REQ: + return hdr_size + sizeof(kib_putreq_msg_t); + + case IBLND_MSG_PUT_ACK: + return hdr_size + sizeof(kib_putack_msg_t); + + case IBLND_MSG_GET_REQ: + return hdr_size + sizeof(kib_get_msg_t); + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + return hdr_size + sizeof(kib_completion_msg_t); + default: + return -1; + } +} + +static int +kiblnd_unpack_rd(kib_msg_t *msg, int flip) +{ + kib_rdma_desc_t *rd; + int nob; + int n; + int i; + + LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ || + msg->ibm_type == IBLND_MSG_PUT_ACK); + + rd = msg->ibm_type == IBLND_MSG_GET_REQ ? + &msg->ibm_u.get.ibgm_rd : + &msg->ibm_u.putack.ibpam_rd; + + if (flip) { + __swab32s(&rd->rd_key); + __swab32s(&rd->rd_nfrags); + } + + n = rd->rd_nfrags; + + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return 1; + } + + nob = offsetof (kib_msg_t, ibm_u) + + kiblnd_rd_msg_size(rd, msg->ibm_type, n); + + if (msg->ibm_nob < nob) { + CERROR("Short %s: %d(%d)\n", + kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); + return 1; + } + + if (!flip) + return 0; + + for (i = 0; i < n; i++) { + __swab32s(&rd->rd_frags[i].rf_nob); + __swab64s(&rd->rd_frags[i].rf_addr); + } + + return 0; +} + +void +kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + kib_net_t *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. */ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = ni->ni_nid; + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int +kiblnd_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u16 version; + int msg_nob; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if (version != IBLND_MSG_VERSION && + version != IBLND_MSG_VERSION_1) { + CERROR("Bad version: %x\n", version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + msg->ibm_version = version; + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { + CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), + msg_nob, kiblnd_msgtype2size(msg->ibm_type)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + case IBLND_MSG_PUT_REQ: + break; + + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_GET_REQ: + if (kiblnd_unpack_rd(msg, flip)) + return -EPROTO; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_net_t *net = ni->ni_data; + int cpt = lnet_cpt_of_nid(nid); + unsigned long flags; + + LASSERT(net != NULL); + LASSERT(nid != LNET_NID_ANY); + + LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_ni = ni; + peer->ibp_nid = nid; + peer->ibp_error = 0; + peer->ibp_last_alive = 0; + atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD(&peer->ibp_conns); + INIT_LIST_HEAD(&peer->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT (net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer; + return 0; +} + +void +kiblnd_destroy_peer (kib_peer_t *peer) +{ + kib_net_t *net = peer->ibp_ni->ni_data; + + LASSERT (net != NULL); + LASSERT (atomic_read(&peer->ibp_refcount) == 0); + LASSERT (!kiblnd_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (peer->ibp_accepting == 0); + LASSERT (list_empty(&peer->ibp_conns)); + LASSERT (list_empty(&peer->ibp_tx_queue)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +kib_peer_t * +kiblnd_find_peer_locked (lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry(tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", + peer, libcfs_nid2str(nid), + atomic_read(&peer->ibp_refcount), + peer->ibp_version); + return peer; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kiblnd_peer_active(peer)); + list_del_init(&peer->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer); +} + +int +kiblnd_get_peer_info (lnet_ni_t *ni, int index, + lnet_nid_t *nidp, int *count) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *count = atomic_read(&peer->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +void +kiblnd_del_peer_locked (kib_peer_t *peer) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (list_empty(&peer->ibp_conns)) { + kiblnd_unlink_peer_locked(peer); + } else { + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer's last conn unlinked it. */ + } + /* NB peer now unlinked; might even be freed if the peer table had the + * last ref on it. */ +} + +int +kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid) +{ + LIST_HEAD (zombies); + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) + continue; + + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); + + list_splice_init(&peer->ibp_tx_queue, + &zombies); + } + + kiblnd_del_peer_locked(peer); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &zombies, -EIO); + + return rc; +} + +kib_conn_t * +kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, kib_conn_t, + ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +void +kiblnd_debug_rx (kib_rx_t *rx) +{ + CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", + rx, rx->rx_status, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +void +kiblnd_debug_tx (kib_tx_t *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + "cookie "LPX64" msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? "-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn (kib_conn_t *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n", + atomic_read(&conn->ibc_refcount), conn, + conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d/%d cred %d o_cred %d r_cred %d\n", + conn->ibc_state, conn->ibc_noops_posted, + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +int +kiblnd_translate_mtu(int value) +{ + switch (value) { + default: + return -1; + case 0: + return 0; + case 256: + return IB_MTU_256; + case 512: + return IB_MTU_512; + case 1024: + return IB_MTU_1024; + case 2048: + return IB_MTU_2048; + case 4096: + return IB_MTU_4096; + } +} + +static void +kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) +{ + int mtu; + + /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ + if (cmid->route.path_rec == NULL) + return; + + mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); + LASSERT (mtu >= 0); + if (mtu != 0) + cmid->route.path_rec->mtu = mtu; +} + +static int +kiblnd_get_completion_vector(kib_conn_t *conn, int cpt) +{ + cpumask_t *mask; + int vectors; + int off; + int i; + + vectors = conn->ibc_cmid->device->num_comp_vectors; + if (vectors <= 1) + return 0; + + mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); + + /* hash NID to CPU id in this partition... */ + off = conn->ibc_peer->ibp_nid % cpus_weight(*mask); + for_each_cpu_mask(i, *mask) { + if (off-- == 0) + return i % vectors; + } + + LBUG(); + return 1; +} + +kib_conn_t * +kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_net_t *net = peer->ibp_ni->ni_data; + kib_dev_t *dev; + struct ib_qp_init_attr *init_qp_attr; + struct kib_sched_info *sched; + kib_conn_t *conn; + struct ib_cq *cq; + unsigned long flags; + int cpt; + int rc; + int i; + + LASSERT(net != NULL); + LASSERT(!in_interrupt()); + + dev = net->ibn_dev; + + cpt = lnet_cpt_of_nid(peer->ibp_nid); + sched = kiblnd_data.kib_scheds[cpt]; + + LASSERT(sched->ibs_nthreads > 0); + + LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt, + sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_0; + } + + LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_1; + } + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_version = version; + conn->ibc_peer = peer; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt, + sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + + write_lock_irqsave(glock, flags); + if (dev->ibd_failover) { + write_unlock_irqrestore(glock, flags); + CERROR("%s: failover in progress\n", dev->ibd_ifname); + goto failed_2; + } + + if (dev->ibd_hdev->ibh_ibdev != cmid->device) { + /* wakeup failover thread and teardown connection */ + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + write_unlock_irqrestore(glock, flags); + CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", + cmid->device->name, dev->ibd_ifname); + goto failed_2; + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + conn->ibc_hdev = dev->ibd_hdev; + + kiblnd_setup_mtu_locked(cmid); + + write_unlock_irqrestore(glock, flags); + + LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt, + IBLND_RX_MSGS(version) * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, + IBLND_RX_MSG_PAGES(version)); + if (rc != 0) + goto failed_2; + + kiblnd_map_rx_descs(conn); + + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES(version), + kiblnd_get_completion_vector(conn, cpt)); + if (IS_ERR(cq)) { + CERROR("Can't create CQ: %ld, cqe: %d\n", + PTR_ERR(cq), IBLND_CQ_ENTRIES(version)); + goto failed_2; + } + + conn->ibc_cq = cq; + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notificiation: %d\n", rc); + goto failed_2; + } + + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version); + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version); + init_qp_attr->cap.max_send_sge = 1; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + + conn->ibc_sched = sched; + + rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); + if (rc != 0) { + CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", + rc, init_qp_attr->cap.max_send_wr, + init_qp_attr->cap.max_recv_wr); + goto failed_2; + } + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version)); + conn->ibc_nrx = IBLND_RX_MSGS(version); + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS(version); i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], + IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&sched->ibs_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS(version) - i; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + /* cmid will be destroyed by CM(ofed) after cm_callback + * returned, so we can't refer it anymore + * (by kiblnd_connd()->kiblnd_destroy_conn) */ + rdma_destroy_qp(conn->ibc_cmid); + conn->ibc_cmid = NULL; + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS(version)) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! */ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn (kib_conn_t *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + kib_peer_t *peer = conn->ibc_peer; + int rc; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_early_rxs)); + LASSERT (list_empty(&conn->ibc_tx_noops)); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_noops_posted == 0); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + /* conn->ibc_cmid might be destroyed by CM already */ + if (cmid != NULL && cmid->qp != NULL) + rdma_destroy_qp(cmid); + + if (conn->ibc_cq != NULL) { + rc = ib_destroy_cq(conn->ibc_cq); + if (rc != 0) + CWARN("Error destroying CQ: %d\n", rc); + } + + if (conn->ibc_rx_pages != NULL) + kiblnd_unmap_rx_descs(conn); + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t)); + } + + if (conn->ibc_connvars != NULL) + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + if (conn->ibc_hdev != NULL) + kiblnd_hdev_decref(conn->ibc_hdev); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + kib_net_t *net = peer->ibp_ni->ni_data; + + kiblnd_peer_decref(peer); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + CDEBUG(D_NET, "Closing conn -> %s, " + "version: %x, reason: %d\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, why); + + kiblnd_close_conn_locked(conn, why); + count++; + } + + return count; +} + +int +kiblnd_close_stale_conns_locked (kib_peer_t *peer, + int version, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_version == version && + conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s version: %x, " + "incarnation:"LPX64"(%x, "LPX64")\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_version, conn->ibc_incarnation, + version, incarnation); + + kiblnd_close_conn_locked(conn, -ESTALE); + count++; + } + + return count; +} + +int +kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid) +{ + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? -ENOENT : 0; +} + +int +kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn; + + rc = 0; + conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + if (conn == NULL) { + rc = -ENOENT; + break; + } + + LASSERT (conn->ibc_cmid != NULL); + data->ioc_nid = conn->ibc_peer->ibp_nid; + if (conn->ibc_cmid->route.path_rec == NULL) + data->ioc_u32[0] = 0; /* iWarp has no path MTU */ + else + data->ioc_u32[0] = + ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); + kiblnd_conn_decref(conn); + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +void +kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) +{ + cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_peer_t *peer; + unsigned long flags; + + read_lock_irqsave(glock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + LASSERT (peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(glock, flags); + + if (last_alive != 0) + *when = last_alive; + + /* peer is not persistent in hash, trigger peer creation + * and connection establishment with a NULL tx */ + if (peer == NULL) + kiblnd_launch_tx(ni, NULL, nid); + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1); + return; +} + +void +kiblnd_free_pages(kib_pages_t *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) { + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + } + + LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages) +{ + kib_pages_t *p; + int i; + + LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt, + offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + __GFP_IO); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_unmap_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + int i; + + LASSERT (conn->ibc_rxs != NULL); + LASSERT (conn->ibc_hdev != NULL); + + for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) { + rx = &conn->ibc_rxs[i]; + + LASSERT (rx->rx_nob >= 0); /* not posted */ + + kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(rx, rx_msgunmap, + rx->rx_msgaddr), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + + conn->ibc_rx_pages = NULL; +} + +void +kiblnd_map_rx_descs(kib_conn_t *conn) +{ + kib_rx_t *rx; + struct page *pg; + int pg_off; + int ipg; + int i; + + for (pg_off = ipg = i = 0; + i < IBLND_RX_MSGS(conn->ibc_version); i++) { + pg = conn->ibc_rx_pages->ibp_pages[ipg]; + rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off); + + rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, + rx->rx_msg, IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, + rx->rx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n", + i, rx->rx_msg, rx->rx_msgaddr, + lnet_page2phys(pg) + pg_off); + + pg_off += IBLND_MSG_SIZE; + LASSERT (pg_off <= PAGE_SIZE); + + if (pg_off == PAGE_SIZE) { + pg_off = 0; + ipg++; + LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version)); + } + } +} + +static void +kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo) +{ + kib_hca_dev_t *hdev = tpo->tpo_hdev; + kib_tx_t *tx; + int i; + + LASSERT (tpo->tpo_pool.po_allocated == 0); + + if (hdev == NULL) + return; + + for (i = 0; i < tpo->tpo_pool.po_size; i++) { + tx = &tpo->tpo_tx_descs[i]; + kiblnd_dma_unmap_single(hdev->ibh_ibdev, + KIBLND_UNMAP_ADDR(tx, tx_msgunmap, + tx->tx_msgaddr), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } + + kiblnd_hdev_decref(hdev); + tpo->tpo_hdev = NULL; +} + +static kib_hca_dev_t * +kiblnd_current_hdev(kib_dev_t *dev) +{ + kib_hca_dev_t *hdev; + unsigned long flags; + int i = 0; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (dev->ibd_failover) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + if (i++ % 50 == 0) + CDEBUG(D_NET, "%s: Wait for failover\n", + dev->ibd_ifname); + schedule_timeout(cfs_time_seconds(1) / 100); + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + + kiblnd_hdev_addref_locked(dev->ibd_hdev); + hdev = dev->ibd_hdev; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + return hdev; +} + +static void +kiblnd_map_tx_pool(kib_tx_pool_t *tpo) +{ + kib_pages_t *txpgs = tpo->tpo_tx_pages; + kib_pool_t *pool = &tpo->tpo_pool; + kib_net_t *net = pool->po_owner->ps_net; + kib_dev_t *dev; + struct page *page; + kib_tx_t *tx; + int page_offset; + int ipage; + int i; + + LASSERT (net != NULL); + + dev = net->ibn_dev; + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0); + + tpo->tpo_hdev = kiblnd_current_hdev(dev); + + for (ipage = page_offset = i = 0; i < pool->po_size; i++) { + page = txpgs->ibp_pages[ipage]; + tx = &tpo->tpo_tx_descs[i]; + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = kiblnd_dma_map_single( + tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, + IBLND_MSG_SIZE, DMA_TO_DEVICE); + LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, + tx->tx_msgaddr)); + KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &pool->po_free_list); + + page_offset += IBLND_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= txpgs->ibp_npages); + } + } +} + +struct ib_mr * +kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size) +{ + __u64 index; + + LASSERT (hdev->ibh_mrs[0] != NULL); + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + index = addr >> hdev->ibh_mr_shift; + + if (index < hdev->ibh_nmrs && + index == ((addr + size - 1) >> hdev->ibh_mr_shift)) + return hdev->ibh_mrs[index]; + + return NULL; +} + +struct ib_mr * +kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd) +{ + struct ib_mr *prev_mr; + struct ib_mr *mr; + int i; + + LASSERT (hdev->ibh_mrs[0] != NULL); + + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags) + return NULL; + + if (hdev->ibh_nmrs == 1) + return hdev->ibh_mrs[0]; + + for (i = 0, mr = prev_mr = NULL; + i < rd->rd_nfrags; i++) { + mr = kiblnd_find_dma_mr(hdev, + rd->rd_frags[i].rf_addr, + rd->rd_frags[i].rf_nob); + if (prev_mr == NULL) + prev_mr = mr; + + if (mr == NULL || prev_mr != mr) { + /* Can't covered by one single MR */ + mr = NULL; + break; + } + } + + return mr; +} + +void +kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool) +{ + LASSERT (pool->fpo_map_count == 0); + + if (pool->fpo_fmr_pool != NULL) + ib_destroy_fmr_pool(pool->fpo_fmr_pool); + + if (pool->fpo_hdev != NULL) + kiblnd_hdev_decref(pool->fpo_hdev); + + LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t)); +} + +void +kiblnd_destroy_fmr_pool_list(struct list_head *head) +{ + kib_fmr_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_fmr_pool_t, fpo_list); + list_del(&pool->fpo_list); + kiblnd_destroy_fmr_pool(pool); + } +} + +static int kiblnd_fmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; + + return max(IBLND_FMR_POOL, size); +} + +static int kiblnd_fmr_flush_trigger(int ncpts) +{ + int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; + + return max(IBLND_FMR_POOL_FLUSH, size); +} + +int +kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo) +{ + /* FMR pool for RDMA */ + kib_dev_t *dev = fps->fps_net->ibn_dev; + kib_fmr_pool_t *fpo; + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = fps->fps_pool_size, + .dirty_watermark = fps->fps_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = !!*kiblnd_tunables.kib_fmr_cache}; + int rc; + + LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo)); + if (fpo == NULL) + return -ENOMEM; + + fpo->fpo_hdev = kiblnd_current_hdev(dev); + + fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, ¶m); + if (IS_ERR(fpo->fpo_fmr_pool)) { + rc = PTR_ERR(fpo->fpo_fmr_pool); + CERROR("Failed to create FMR pool: %d\n", rc); + + kiblnd_hdev_decref(fpo->fpo_hdev); + LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t)); + return rc; + } + + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_owner = fps; + *pp_fpo = fpo; + + return 0; +} + +static void +kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies) +{ + if (fps->fps_net == NULL) /* intialized? */ + return; + + spin_lock(&fps->fps_lock); + + while (!list_empty(&fps->fps_pool_list)) { + kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next, + kib_fmr_pool_t, fpo_list); + fpo->fpo_failed = 1; + list_del(&fpo->fpo_list); + if (fpo->fpo_map_count == 0) + list_add(&fpo->fpo_list, zombies); + else + list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); + } + + spin_unlock(&fps->fps_lock); +} + +static void +kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps) +{ + if (fps->fps_net != NULL) { /* initialized? */ + kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); + kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); + } +} + +static int +kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net, + int pool_size, int flush_trigger) +{ + kib_fmr_pool_t *fpo; + int rc; + + memset(fps, 0, sizeof(kib_fmr_poolset_t)); + + fps->fps_net = net; + fps->fps_cpt = cpt; + fps->fps_pool_size = pool_size; + fps->fps_flush_trigger = flush_trigger; + spin_lock_init(&fps->fps_lock); + INIT_LIST_HEAD(&fps->fps_pool_list); + INIT_LIST_HEAD(&fps->fps_failed_pool_list); + + rc = kiblnd_create_fmr_pool(fps, &fpo); + if (rc == 0) + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + + return rc; +} + +static int +kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now) +{ + if (fpo->fpo_map_count != 0) /* still in use */ + return 0; + if (fpo->fpo_failed) + return 1; + return cfs_time_aftereq(now, fpo->fpo_deadline); +} + +void +kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status) +{ + LIST_HEAD (zombies); + kib_fmr_pool_t *fpo = fmr->fmr_pool; + kib_fmr_poolset_t *fps = fpo->fpo_owner; + cfs_time_t now = cfs_time_current(); + kib_fmr_pool_t *tmp; + int rc; + + rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); + LASSERT (rc == 0); + + if (status != 0) { + rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool); + LASSERT (rc == 0); + } + + fmr->fmr_pool = NULL; + fmr->fmr_pfmr = NULL; + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count --; /* decref the pool */ + + list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { + /* the first pool is persistent */ + if (fps->fps_pool_list.next == &fpo->fpo_list) + continue; + + if (kiblnd_fmr_pool_is_idle(fpo, now)) { + list_move(&fpo->fpo_list, &zombies); + fps->fps_version ++; + } + } + spin_unlock(&fps->fps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_fmr_pool_list(&zombies); +} + +int +kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages, + __u64 iov, kib_fmr_t *fmr) +{ + struct ib_pool_fmr *pfmr; + kib_fmr_pool_t *fpo; + __u64 version; + int rc; + + again: + spin_lock(&fps->fps_lock); + version = fps->fps_version; + list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { + fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + fpo->fpo_map_count++; + spin_unlock(&fps->fps_lock); + + pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool, + pages, npages, iov); + if (likely(!IS_ERR(pfmr))) { + fmr->fmr_pool = fpo; + fmr->fmr_pfmr = pfmr; + return 0; + } + + spin_lock(&fps->fps_lock); + fpo->fpo_map_count--; + if (PTR_ERR(pfmr) != -EAGAIN) { + spin_unlock(&fps->fps_lock); + return PTR_ERR(pfmr); + } + + /* EAGAIN and ... */ + if (version != fps->fps_version) { + spin_unlock(&fps->fps_lock); + goto again; + } + } + + if (fps->fps_increasing) { + spin_unlock(&fps->fps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "FMR pool, waiting for her to complete\n"); + schedule(); + goto again; + + } + + if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) { + /* someone failed recently */ + spin_unlock(&fps->fps_lock); + return -EAGAIN; + } + + fps->fps_increasing = 1; + spin_unlock(&fps->fps_lock); + + CDEBUG(D_NET, "Allocate new FMR pool\n"); + rc = kiblnd_create_fmr_pool(fps, &fpo); + spin_lock(&fps->fps_lock); + fps->fps_increasing = 0; + if (rc == 0) { + fps->fps_version++; + list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); + } else { + fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + } + spin_unlock(&fps->fps_lock); + + goto again; +} + +static void +kiblnd_fini_pool(kib_pool_t *pool) +{ + LASSERT (list_empty(&pool->po_free_list)); + LASSERT (pool->po_allocated == 0); + + CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); +} + +static void +kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size) +{ + CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); + + memset(pool, 0, sizeof(kib_pool_t)); + INIT_LIST_HEAD(&pool->po_free_list); + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + pool->po_owner = ps; + pool->po_size = size; +} + +void +kiblnd_destroy_pool_list(struct list_head *head) +{ + kib_pool_t *pool; + + while (!list_empty(head)) { + pool = list_entry(head->next, kib_pool_t, po_list); + list_del(&pool->po_list); + + LASSERT (pool->po_owner != NULL); + pool->po_owner->ps_pool_destroy(pool); + } +} + +static void +kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies) +{ + if (ps->ps_net == NULL) /* intialized? */ + return; + + spin_lock(&ps->ps_lock); + while (!list_empty(&ps->ps_pool_list)) { + kib_pool_t *po = list_entry(ps->ps_pool_list.next, + kib_pool_t, po_list); + po->po_failed = 1; + list_del(&po->po_list); + if (po->po_allocated == 0) + list_add(&po->po_list, zombies); + else + list_add(&po->po_list, &ps->ps_failed_pool_list); + } + spin_unlock(&ps->ps_lock); +} + +static void +kiblnd_fini_poolset(kib_poolset_t *ps) +{ + if (ps->ps_net != NULL) { /* initialized? */ + kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); + kiblnd_destroy_pool_list(&ps->ps_pool_list); + } +} + +static int +kiblnd_init_poolset(kib_poolset_t *ps, int cpt, + kib_net_t *net, char *name, int size, + kib_ps_pool_create_t po_create, + kib_ps_pool_destroy_t po_destroy, + kib_ps_node_init_t nd_init, + kib_ps_node_fini_t nd_fini) +{ + kib_pool_t *pool; + int rc; + + memset(ps, 0, sizeof(kib_poolset_t)); + + ps->ps_cpt = cpt; + ps->ps_net = net; + ps->ps_pool_create = po_create; + ps->ps_pool_destroy = po_destroy; + ps->ps_node_init = nd_init; + ps->ps_node_fini = nd_fini; + ps->ps_pool_size = size; + if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) + >= sizeof(ps->ps_name)) + return -E2BIG; + spin_lock_init(&ps->ps_lock); + INIT_LIST_HEAD(&ps->ps_pool_list); + INIT_LIST_HEAD(&ps->ps_failed_pool_list); + + rc = ps->ps_pool_create(ps, size, &pool); + if (rc == 0) + list_add(&pool->po_list, &ps->ps_pool_list); + else + CERROR("Failed to create the first pool for %s\n", ps->ps_name); + + return rc; +} + +static int +kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now) +{ + if (pool->po_allocated != 0) /* still in use */ + return 0; + if (pool->po_failed) + return 1; + return cfs_time_aftereq(now, pool->po_deadline); +} + +void +kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node) +{ + LIST_HEAD (zombies); + kib_poolset_t *ps = pool->po_owner; + kib_pool_t *tmp; + cfs_time_t now = cfs_time_current(); + + spin_lock(&ps->ps_lock); + + if (ps->ps_node_fini != NULL) + ps->ps_node_fini(pool, node); + + LASSERT (pool->po_allocated > 0); + list_add(node, &pool->po_free_list); + pool->po_allocated --; + + list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { + /* the first pool is persistent */ + if (ps->ps_pool_list.next == &pool->po_list) + continue; + + if (kiblnd_pool_is_idle(pool, now)) + list_move(&pool->po_list, &zombies); + } + spin_unlock(&ps->ps_lock); + + if (!list_empty(&zombies)) + kiblnd_destroy_pool_list(&zombies); +} + +struct list_head * +kiblnd_pool_alloc_node(kib_poolset_t *ps) +{ + struct list_head *node; + kib_pool_t *pool; + int rc; + + again: + spin_lock(&ps->ps_lock); + list_for_each_entry(pool, &ps->ps_pool_list, po_list) { + if (list_empty(&pool->po_free_list)) + continue; + + pool->po_allocated ++; + pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE); + node = pool->po_free_list.next; + list_del(node); + + if (ps->ps_node_init != NULL) { + /* still hold the lock */ + ps->ps_node_init(pool, node); + } + spin_unlock(&ps->ps_lock); + return node; + } + + /* no available tx pool and ... */ + if (ps->ps_increasing) { + /* another thread is allocating a new pool */ + spin_unlock(&ps->ps_lock); + CDEBUG(D_NET, "Another thread is allocating new " + "%s pool, waiting for her to complete\n", + ps->ps_name); + schedule(); + goto again; + } + + if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) { + /* someone failed recently */ + spin_unlock(&ps->ps_lock); + return NULL; + } + + ps->ps_increasing = 1; + spin_unlock(&ps->ps_lock); + + CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); + + rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); + + spin_lock(&ps->ps_lock); + ps->ps_increasing = 0; + if (rc == 0) { + list_add_tail(&pool->po_list, &ps->ps_pool_list); + } else { + ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY); + CERROR("Can't allocate new %s pool because out of memory\n", + ps->ps_name); + } + spin_unlock(&ps->ps_lock); + + goto again; +} + +void +kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr) +{ + kib_pmr_pool_t *ppo = pmr->pmr_pool; + struct ib_mr *mr = pmr->pmr_mr; + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list); + if (mr != NULL) + ib_dereg_mr(mr); +} + +int +kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr) +{ + kib_phys_mr_t *pmr; + struct list_head *node; + int rc; + int i; + + node = kiblnd_pool_alloc_node(&pps->pps_poolset); + if (node == NULL) { + CERROR("Failed to allocate PMR descriptor\n"); + return -ENOMEM; + } + + pmr = container_of(node, kib_phys_mr_t, pmr_list); + if (pmr->pmr_pool->ppo_hdev != hdev) { + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + return -EAGAIN; + } + + for (i = 0; i < rd->rd_nfrags; i ++) { + pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr; + pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob; + } + + pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd, + pmr->pmr_ipb, rd->rd_nfrags, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE, + iova); + if (!IS_ERR(pmr->pmr_mr)) { + pmr->pmr_iova = *iova; + *pp_pmr = pmr; + return 0; + } + + rc = PTR_ERR(pmr->pmr_mr); + CERROR("Failed ib_reg_phys_mr: %d\n", rc); + + pmr->pmr_mr = NULL; + kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node); + + return rc; +} + +static void +kiblnd_destroy_pmr_pool(kib_pool_t *pool) +{ + kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool); + kib_phys_mr_t *pmr; + + LASSERT (pool->po_allocated == 0); + + while (!list_empty(&pool->po_free_list)) { + pmr = list_entry(pool->po_free_list.next, + kib_phys_mr_t, pmr_list); + + LASSERT (pmr->pmr_mr == NULL); + list_del(&pmr->pmr_list); + + if (pmr->pmr_ipb != NULL) { + LIBCFS_FREE(pmr->pmr_ipb, + IBLND_MAX_RDMA_FRAGS * + sizeof(struct ib_phys_buf)); + } + + LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t)); + } + + kiblnd_fini_pool(pool); + if (ppo->ppo_hdev != NULL) + kiblnd_hdev_decref(ppo->ppo_hdev); + + LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t)); +} + +static inline int kiblnd_pmr_pool_size(int ncpts) +{ + int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts; + + return max(IBLND_PMR_POOL, size); +} + +static int +kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +{ + struct kib_pmr_pool *ppo; + struct kib_pool *pool; + kib_phys_mr_t *pmr; + int i; + + LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_pmr_pool_t)); + if (ppo == NULL) { + CERROR("Failed to allocate PMR pool\n"); + return -ENOMEM; + } + + pool = &ppo->ppo_pool; + kiblnd_init_pool(ps, pool, size); + + for (i = 0; i < size; i++) { + LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(), + ps->ps_cpt, sizeof(kib_phys_mr_t)); + if (pmr == NULL) + break; + + pmr->pmr_pool = ppo; + LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb)); + if (pmr->pmr_ipb == NULL) + break; + + list_add(&pmr->pmr_list, &pool->po_free_list); + } + + if (i < size) { + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev); + *pp_po = pool; + return 0; +} + +static void +kiblnd_destroy_tx_pool(kib_pool_t *pool) +{ + kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool); + int i; + + LASSERT (pool->po_allocated == 0); + + if (tpo->tpo_tx_pages != NULL) { + kiblnd_unmap_tx_pool(tpo); + kiblnd_free_pages(tpo->tpo_tx_pages); + } + + if (tpo->tpo_tx_descs == NULL) + goto out; + + for (i = 0; i < pool->po_size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + list_del(&tx->tx_list); + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, + LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + } + + LIBCFS_FREE(tpo->tpo_tx_descs, + pool->po_size * sizeof(kib_tx_t)); +out: + kiblnd_fini_pool(pool); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); +} + +static int kiblnd_tx_pool_size(int ncpts) +{ + int ntx = *kiblnd_tunables.kib_ntx / ncpts; + + return max(IBLND_TX_POOL, ntx); +} + +static int +kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po) +{ + int i; + int npg; + kib_pool_t *pool; + kib_tx_pool_t *tpo; + + LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo)); + if (tpo == NULL) { + CERROR("Failed to allocate TX pool\n"); + return -ENOMEM; + } + + pool = &tpo->tpo_pool; + kiblnd_init_pool(ps, pool, size); + tpo->tpo_tx_descs = NULL; + tpo->tpo_tx_pages = NULL; + + npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE; + if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) { + CERROR("Can't allocate tx pages: %d\n", npg); + LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t)); + return -ENOMEM; + } + + LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt, + size * sizeof(kib_tx_t)); + if (tpo->tpo_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", size); + ps->ps_pool_destroy(pool); + return -ENOMEM; + } + + memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t)); + + for (i = 0; i < size; i++) { + kib_tx_t *tx = &tpo->tpo_tx_descs[i]; + + tx->tx_pool = tpo; + if (ps->ps_net->ibn_fmr_ps != NULL) { + LIBCFS_CPT_ALLOC(tx->tx_pages, + lnet_cpt_table(), ps->ps_cpt, + LNET_MAX_IOV * sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + break; + } + + LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt, + IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + break; + + sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS); + + LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + break; + + LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + break; + } + + if (i == size) { + kiblnd_map_tx_pool(tpo); + *pp_po = pool; + return 0; + } + + ps->ps_pool_destroy(pool); + return -ENOMEM; +} + +static void +kiblnd_tx_init(kib_pool_t *pool, struct list_head *node) +{ + kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t, + tps_poolset); + kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list); + + tx->tx_cookie = tps->tps_next_tx_cookie ++; +} + +void +kiblnd_net_fini_pools(kib_net_t *net) +{ + int i; + + cfs_cpt_for_each(i, lnet_cpt_table()) { + kib_tx_poolset_t *tps; + kib_fmr_poolset_t *fps; + kib_pmr_poolset_t *pps; + + if (net->ibn_tx_ps != NULL) { + tps = net->ibn_tx_ps[i]; + kiblnd_fini_poolset(&tps->tps_poolset); + } + + if (net->ibn_fmr_ps != NULL) { + fps = net->ibn_fmr_ps[i]; + kiblnd_fini_fmr_poolset(fps); + } + + if (net->ibn_pmr_ps != NULL) { + pps = net->ibn_pmr_ps[i]; + kiblnd_fini_poolset(&pps->pps_poolset); + } + } + + if (net->ibn_tx_ps != NULL) { + cfs_percpt_free(net->ibn_tx_ps); + net->ibn_tx_ps = NULL; + } + + if (net->ibn_fmr_ps != NULL) { + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + } + + if (net->ibn_pmr_ps != NULL) { + cfs_percpt_free(net->ibn_pmr_ps); + net->ibn_pmr_ps = NULL; + } +} + +int +kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) +{ + unsigned long flags; + int cpt; + int rc; + int i; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (*kiblnd_tunables.kib_map_on_demand == 0 && + net->ibn_dev->ibd_hdev->ibh_nmrs == 1) { + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + goto create_tx_pool; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + /* TX pool must be created later than FMR/PMR, see LU-2268 + * for details */ + LASSERT(net->ibn_tx_ps == NULL); + + /* premapping can fail if ibd_nmr > 1, so we always create + * FMR/PMR pool and map-on-demand if premapping failed */ + + net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_fmr_poolset_t)); + if (net->ibn_fmr_ps == NULL) { + CERROR("Failed to allocate FMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, + kiblnd_fmr_pool_size(ncpts), + kiblnd_fmr_flush_trigger(ncpts)); + if (rc == -ENOSYS && i == 0) /* no FMR */ + break; /* create PMR pool */ + + if (rc != 0) { /* a real error */ + CERROR("Can't initialize FMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + if (i > 0) { + LASSERT(i == ncpts); + goto create_tx_pool; + } + + cfs_percpt_free(net->ibn_fmr_ps); + net->ibn_fmr_ps = NULL; + + CWARN("Device does not support FMR, failing back to PMR\n"); + + if (*kiblnd_tunables.kib_pmr_pool_size < + *kiblnd_tunables.kib_ntx / 4) { + CERROR("Can't set pmr pool size (%d) < ntx / 4(%d)\n", + *kiblnd_tunables.kib_pmr_pool_size, + *kiblnd_tunables.kib_ntx / 4); + rc = -EINVAL; + goto failed; + } + + net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_pmr_poolset_t)); + if (net->ibn_pmr_ps == NULL) { + CERROR("Failed to allocate PMR pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset, + cpt, net, "PMR", + kiblnd_pmr_pool_size(ncpts), + kiblnd_create_pmr_pool, + kiblnd_destroy_pmr_pool, NULL, NULL); + if (rc != 0) { + CERROR("Can't initialize PMR pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + create_tx_pool: + net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(kib_tx_poolset_t)); + if (net->ibn_tx_ps == NULL) { + CERROR("Failed to allocate tx pool array\n"); + rc = -ENOMEM; + goto failed; + } + + for (i = 0; i < ncpts; i++) { + cpt = (cpts == NULL) ? i : cpts[i]; + rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, + cpt, net, "TX", + kiblnd_tx_pool_size(ncpts), + kiblnd_create_tx_pool, + kiblnd_destroy_tx_pool, + kiblnd_tx_init, NULL); + if (rc != 0) { + CERROR("Can't initialize TX pool for CPT %d: %d\n", + cpt, rc); + goto failed; + } + } + + return 0; + failed: + kiblnd_net_fini_pools(net); + LASSERT(rc != 0); + return rc; +} + +static int +kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) +{ + struct ib_device_attr *attr; + int rc; + + /* It's safe to assume a HCA can handle a page size + * matching that of the native system */ + hdev->ibh_page_shift = PAGE_SHIFT; + hdev->ibh_page_size = 1 << PAGE_SHIFT; + hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); + + LIBCFS_ALLOC(attr, sizeof(*attr)); + if (attr == NULL) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ib_query_device(hdev->ibh_ibdev, attr); + if (rc == 0) + hdev->ibh_mr_size = attr->max_mr_size; + + LIBCFS_FREE(attr, sizeof(*attr)); + + if (rc != 0) { + CERROR("Failed to query IB device: %d\n", rc); + return rc; + } + + if (hdev->ibh_mr_size == ~0ULL) { + hdev->ibh_mr_shift = 64; + return 0; + } + + for (hdev->ibh_mr_shift = 0; + hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift ++) { + if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) || + hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1) + return 0; + } + + CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size); + return -EINVAL; +} + +void +kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev) +{ + int i; + + if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL) + return; + + for (i = 0; i < hdev->ibh_nmrs; i++) { + if (hdev->ibh_mrs[i] == NULL) + break; + + ib_dereg_mr(hdev->ibh_mrs[i]); + } + + LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + hdev->ibh_mrs = NULL; + hdev->ibh_nmrs = 0; +} + +void +kiblnd_hdev_destroy(kib_hca_dev_t *hdev) +{ + kiblnd_hdev_cleanup_mrs(hdev); + + if (hdev->ibh_pd != NULL) + ib_dealloc_pd(hdev->ibh_pd); + + if (hdev->ibh_cmid != NULL) + rdma_destroy_id(hdev->ibh_cmid); + + LIBCFS_FREE(hdev, sizeof(*hdev)); +} + +int +kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev) +{ + struct ib_mr *mr; + int i; + int rc; + __u64 mm_size; + __u64 mr_size; + int acflags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE; + + rc = kiblnd_hdev_get_attr(hdev); + if (rc != 0) + return rc; + + if (hdev->ibh_mr_shift == 64) { + LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs)); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs table\n"); + return -ENOMEM; + } + + hdev->ibh_mrs[0] = NULL; + hdev->ibh_nmrs = 1; + + mr = ib_get_dma_mr(hdev->ibh_pd, acflags); + if (IS_ERR(mr)) { + CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + hdev->ibh_mrs[0] = mr; + + goto out; + } + + mr_size = (1ULL << hdev->ibh_mr_shift); + mm_size = (unsigned long)high_memory - PAGE_OFFSET; + + hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift); + + if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) { + /* it's 4T..., assume we will re-code at that time */ + CERROR("Can't support memory size: x"LPX64 + " with MR size: x"LPX64"\n", mm_size, mr_size); + return -EINVAL; + } + + /* create an array of MRs to cover all memory */ + LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + if (hdev->ibh_mrs == NULL) { + CERROR("Failed to allocate MRs' table\n"); + return -ENOMEM; + } + + memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs); + + for (i = 0; i < hdev->ibh_nmrs; i++) { + struct ib_phys_buf ipb; + __u64 iova; + + ipb.size = hdev->ibh_mr_size; + ipb.addr = i * mr_size; + iova = ipb.addr; + + mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova); + if (IS_ERR(mr)) { + CERROR("Failed ib_reg_phys_mr addr "LPX64 + " size "LPX64" : %ld\n", + ipb.addr, ipb.size, PTR_ERR(mr)); + kiblnd_hdev_cleanup_mrs(hdev); + return PTR_ERR(mr); + } + + LASSERT (iova == ipb.addr); + + hdev->ibh_mrs[i] = mr; + } + +out: + if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1) + LCONSOLE_INFO("Register global MR array, MR size: " + LPX64", array size: %d\n", + hdev->ibh_mr_size, hdev->ibh_nmrs); + return 0; +} + +static int +kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ /* DUMMY */ + return 0; +} + +static int +kiblnd_dev_need_failover(kib_dev_t *dev) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + if (dev->ibd_hdev == NULL || /* initializing */ + dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */ + *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ + return 1; + + /* XXX: it's UGLY, but I don't have better way to find + * ib-bonding HCA failover because: + * + * a. no reliable CM event for HCA failover... + * b. no OFED API to get ib_device for current net_device... + * + * We have only two choices at this point: + * + * a. rdma_bind_addr(), it will conflict with listener cmid + * b. rdma_resolve_addr() to zero addr */ + cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + return rc; + } + + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, 1); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n", + dev->ibd_ifname, HIPQUAD(dev->ibd_ifip), + cmid->device, rc); + rdma_destroy_id(cmid); + return rc; + } + + if (dev->ibd_hdev->ibh_ibdev == cmid->device) { + /* don't need device failover */ + rdma_destroy_id(cmid); + return 0; + } + + return 1; +} + +int +kiblnd_dev_failover(kib_dev_t *dev) +{ + LIST_HEAD (zombie_tpo); + LIST_HEAD (zombie_ppo); + LIST_HEAD (zombie_fpo); + struct rdma_cm_id *cmid = NULL; + kib_hca_dev_t *hdev = NULL; + kib_hca_dev_t *old; + struct ib_pd *pd; + kib_net_t *net; + struct sockaddr_in addr; + unsigned long flags; + int rc = 0; + int i; + + LASSERT (*kiblnd_tunables.kib_dev_failover > 1 || + dev->ibd_can_failover || + dev->ibd_hdev == NULL); + + rc = kiblnd_dev_need_failover(dev); + if (rc <= 0) + goto out; + + if (dev->ibd_hdev != NULL && + dev->ibd_hdev->ibh_cmid != NULL) { + /* XXX it's not good to close old listener at here, + * because we can fail to create new listener. + * But we have to close it now, otherwise rdma_bind_addr + * will return EADDRINUSE... How crap! */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + cmid = dev->ibd_hdev->ibh_cmid; + /* make next schedule of kiblnd_dev_need_failover() + * return 1 for me */ + dev->ibd_hdev->ibh_cmid = NULL; + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + rdma_destroy_id(cmid); + } + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, + IB_QPT_RC); + if (IS_ERR(cmid)) { + rc = PTR_ERR(cmid); + CERROR("Failed to create cmid for failover: %d\n", rc); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip); + addr.sin_port = htons(*kiblnd_tunables.kib_service); + + /* Bind to failover device or port */ + rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); + if (rc != 0 || cmid->device == NULL) { + CERROR("Failed to bind %s:%u.%u.%u.%u to device(%p): %d\n", + dev->ibd_ifname, HIPQUAD(dev->ibd_ifip), + cmid->device, rc); + rdma_destroy_id(cmid); + goto out; + } + + LIBCFS_ALLOC(hdev, sizeof(*hdev)); + if (hdev == NULL) { + CERROR("Failed to allocate kib_hca_dev\n"); + rdma_destroy_id(cmid); + rc = -ENOMEM; + goto out; + } + + atomic_set(&hdev->ibh_ref, 1); + hdev->ibh_dev = dev; + hdev->ibh_cmid = cmid; + hdev->ibh_ibdev = cmid->device; + + pd = ib_alloc_pd(cmid->device); + if (IS_ERR(pd)) { + rc = PTR_ERR(pd); + CERROR("Can't allocate PD: %d\n", rc); + goto out; + } + + hdev->ibh_pd = pd; + + rc = rdma_listen(cmid, 0); + if (rc != 0) { + CERROR("Can't start new listener: %d\n", rc); + goto out; + } + + rc = kiblnd_hdev_setup_mrs(hdev); + if (rc != 0) { + CERROR("Can't setup device: %d\n", rc); + goto out; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + old = dev->ibd_hdev; + dev->ibd_hdev = hdev; /* take over the refcount */ + hdev = old; + + list_for_each_entry(net, &dev->ibd_nets, ibn_list) { + cfs_cpt_for_each(i, lnet_cpt_table()) { + kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, + &zombie_tpo); + + if (net->ibn_fmr_ps != NULL) { + kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], + &zombie_fpo); + + } else if (net->ibn_pmr_ps != NULL) { + kiblnd_fail_poolset(&net->ibn_pmr_ps[i]-> + pps_poolset, &zombie_ppo); + } + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + out: + if (!list_empty(&zombie_tpo)) + kiblnd_destroy_pool_list(&zombie_tpo); + if (!list_empty(&zombie_ppo)) + kiblnd_destroy_pool_list(&zombie_ppo); + if (!list_empty(&zombie_fpo)) + kiblnd_destroy_fmr_pool_list(&zombie_fpo); + if (hdev != NULL) + kiblnd_hdev_decref(hdev); + + if (rc != 0) + dev->ibd_failed_failover++; + else + dev->ibd_failed_failover = 0; + + return rc; +} + +void +kiblnd_destroy_dev (kib_dev_t *dev) +{ + LASSERT (dev->ibd_nnets == 0); + LASSERT (list_empty(&dev->ibd_nets)); + + list_del(&dev->ibd_fail_list); + list_del(&dev->ibd_list); + + if (dev->ibd_hdev != NULL) + kiblnd_hdev_decref(dev->ibd_hdev); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +kib_dev_t * +kiblnd_create_dev(char *ifname) +{ + struct net_device *netdev; + kib_dev_t *dev; + __u32 netmask; + __u32 ip; + int up; + int rc; + + rc = libcfs_ipif_query(ifname, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", + ifname, rc); + return NULL; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ifname); + return NULL; + } + + LIBCFS_ALLOC(dev, sizeof(*dev)); + if (dev == NULL) + return NULL; + + memset(dev, 0, sizeof(*dev)); + netdev = dev_get_by_name(&init_net, ifname); + if (netdev == NULL) { + dev->ibd_can_failover = 0; + } else { + dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); + dev_put(netdev); + } + + INIT_LIST_HEAD(&dev->ibd_nets); + INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ + INIT_LIST_HEAD(&dev->ibd_fail_list); + dev->ibd_ifip = ip; + strcpy(&dev->ibd_ifname[0], ifname); + + /* initialize the device */ + rc = kiblnd_dev_failover(dev); + if (rc != 0) { + CERROR("Can't initialize device: %d\n", rc); + LIBCFS_FREE(dev, sizeof(*dev)); + return NULL; + } + + list_add_tail(&dev->ibd_list, + &kiblnd_data.kib_devs); + return dev; +} + +void +kiblnd_base_shutdown(void) +{ + struct kib_sched_info *sched; + int i; + + LASSERT (list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT (kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + LASSERT (list_empty(&kiblnd_data.kib_peers[i])); + } + LASSERT (list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT (list_empty(&kiblnd_data.kib_connd_conns)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + + /* NB: we really want to stop scheduler threads net by net + * instead of the whole module, this should be improved + * with dynamic configuration LNet */ + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) + wake_up_all(&sched->ibs_waitq); + + wake_up_all(&kiblnd_data.kib_connd_waitq); + wake_up_all(&kiblnd_data.kib_failover_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + cfs_pause(cfs_time_seconds(1)); + } + + /* fall through */ + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) { + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + } + + if (kiblnd_data.kib_scheds != NULL) + cfs_percpt_free(kiblnd_data.kib_scheds); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + module_put(THIS_MODULE); +} + +void +kiblnd_shutdown (lnet_ni_t *ni) +{ + kib_net_t *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + cfs_pause(cfs_time_seconds(1)); + } + + kiblnd_net_fini_pools(net); + + write_lock_irqsave(g_lock, flags); + LASSERT(net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + list_del(&net->ibn_list); + write_unlock_irqrestore(g_lock, flags); + + /* fall through */ + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); + return; +} + +int +kiblnd_base_startup(void) +{ + struct kib_sched_info *sched; + int rc; + int i; + + LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + try_module_get(THIS_MODULE); + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + init_waitqueue_head(&kiblnd_data.kib_failover_waitq); + + kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*sched)); + if (kiblnd_data.kib_scheds == NULL) + goto failed; + + cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { + int nthrs; + + spin_lock_init(&sched->ibs_lock); + INIT_LIST_HEAD(&sched->ibs_conns); + init_waitqueue_head(&sched->ibs_waitq); + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); + } else { + /* max to half of CPUs, another half is reserved for + * upper layer modules */ + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + } + + sched->ibs_nthreads_max = nthrs; + sched->ibs_cpt = i; + } + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + if (*kiblnd_tunables.kib_dev_failover != 0) + rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, + "kiblnd_failover"); + + if (rc != 0) { + CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +int +kiblnd_start_schedulers(struct kib_sched_info *sched) +{ + int rc = 0; + int nthrs; + int i; + + if (sched->ibs_nthreads == 0) { + if (*kiblnd_tunables.kib_nscheds > 0) { + nthrs = sched->ibs_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + sched->ibs_cpt); + nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); + nthrs = min(IBLND_N_SCHED_HIGH, nthrs); + } + } else { + LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); + /* increase one thread if there is new interface */ + nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); + snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", + KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); + rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + sched->ibs_cpt, sched->ibs_nthreads + i, rc); + break; + } + + sched->ibs_nthreads += i; + return rc; +} + +int +kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts) +{ + int cpt; + int rc; + int i; + + for (i = 0; i < ncpts; i++) { + struct kib_sched_info *sched; + + cpt = (cpts == NULL) ? i : cpts[i]; + sched = kiblnd_data.kib_scheds[cpt]; + + if (!newdev && sched->ibs_nthreads > 0) + continue; + + rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); + if (rc != 0) { + CERROR("Failed to start scheduler threads for %s\n", + dev->ibd_ifname); + return rc; + } + } + return 0; +} + +kib_dev_t * +kiblnd_dev_search(char *ifname) +{ + kib_dev_t *alias = NULL; + kib_dev_t *dev; + char *colon; + char *colon2; + + colon = strchr(ifname, ':'); + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + return dev; + + if (alias != NULL) + continue; + + colon2 = strchr(dev->ibd_ifname, ':'); + if (colon != NULL) + *colon = 0; + if (colon2 != NULL) + *colon2 = 0; + + if (strcmp(&dev->ibd_ifname[0], ifname) == 0) + alias = dev; + + if (colon != NULL) + *colon = ':'; + if (colon2 != NULL) + *colon2 = ':'; + } + return alias; +} + +int +kiblnd_startup (lnet_ni_t *ni) +{ + char *ifname; + kib_dev_t *ibdev = NULL; + kib_net_t *net; + struct timeval tv; + unsigned long flags; + int rc; + int newdev; + + LASSERT (ni->ni_lnd == &the_o2iblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto failed; + + memset(net, 0, sizeof(*net)); + + do_gettimeofday(&tv); + net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout; + ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; + ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits; + ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits; + + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + + CLASSERT (LNET_MAX_INTERFACES > 1); + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } + + ibdev = kiblnd_dev_search(ifname); + + newdev = ibdev == NULL; + /* hmm...create kib_dev even for alias */ + if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0) + ibdev = kiblnd_create_dev(ifname); + + if (ibdev == NULL) + goto failed; + + net->ibn_dev = ibdev; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); + + rc = kiblnd_dev_start_threads(ibdev, newdev, + ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto failed; + + rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) { + CERROR("Failed to initialize NI pools: %d\n", rc); + goto failed; + } + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + ibdev->ibd_nnets++; + list_add_tail(&net->ibn_list, &ibdev->ibd_nets); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + if (net->ibn_dev == NULL && ibdev != NULL) + kiblnd_destroy_dev(ibdev); + + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; +} + +void __exit +kiblnd_module_fini (void) +{ + lnet_unregister_lnd(&the_o2iblnd); + kiblnd_tunables_fini(); +} + +int __init +kiblnd_module_init (void) +{ + int rc; + + CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE); + CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_o2iblnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00"); +MODULE_LICENSE("GPL"); + +module_init(kiblnd_module_init); +module_exit(kiblnd_module_fini); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 000000000000..e4626bf82fc7 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,1057 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd.h + * + * Author: Eric Barton + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +/* # scheduler loops before reschedule */ +#define IBLND_RESCHED 100 + +#define IBLND_N_SCHED 2 +#define IBLND_N_SCHED_HIGH 4 + +typedef struct +{ + int *kib_dev_failover; /* HCA failover */ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peertxcredits; /* # concurrent sends to 1 peer */ + int *kib_peerrtrcredits; /* # per-peer router buffer credits */ + int *kib_peercredits_hiw; /* # when eagerly to return credits */ + int *kib_peertimeout; /* seconds to consider peer dead */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_concurrent_sends; /* send work queue sizing */ + int *kib_ib_mtu; /* IB MTU */ + int *kib_map_on_demand; /* map-on-demand if RD has more fragments + * than this value, 0 disable map-on-demand */ + int *kib_pmr_pool_size; /* # physical MR in pool */ + int *kib_fmr_pool_size; /* # FMRs in pool */ + int *kib_fmr_flush_trigger; /* When to trigger FMR flush */ + int *kib_fmr_cache; /* enable FMR pool cache? */ +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + ctl_table_header_t *kib_sysctl; /* sysctl interface */ +#endif + int *kib_require_priv_port;/* accept only privileged ports */ + int *kib_use_priv_port; /* use privileged port for active connect */ + /* # threads on each CPT */ + int *kib_nscheds; +} kib_tunables_t; + +extern kib_tunables_t kiblnd_tunables; + +#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ + +#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ +#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t*) 0)->ibm_credits)) - 1) /* Max # of peer credits */ + +#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MSG_QUEUE_SIZE_V1 : \ + *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */ +#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_CREDIT_HIGHWATER_V1 : \ + *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */ + +#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt) + +static inline int +kiblnd_concurrent_sends_v1(void) +{ + if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) + return IBLND_MSG_QUEUE_SIZE_V1 * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) + return IBLND_MSG_QUEUE_SIZE_V1 / 2; + + return *kiblnd_tunables.kib_concurrent_sends; +} + +#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? \ + kiblnd_concurrent_sends_v1() : \ + *kiblnd_tunables.kib_concurrent_sends) +/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ +#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) +#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) + +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */ +#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand != 0 ? \ + *kiblnd_tunables.kib_map_on_demand : \ + IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */ +#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \ + IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS) + +/************************/ +/* derived constants... */ +/* Pools (shared by connections on each CPT) */ +/* These pools can grow at runtime, so don't need give a very large value */ +#define IBLND_TX_POOL 256 +#define IBLND_PMR_POOL 256 +#define IBLND_FMR_POOL 256 +#define IBLND_FMR_POOL_FLUSH 192 + +/* TX messages (shared by all connections) */ +#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS(v) (IBLND_MSG_QUEUE_SIZE(v) * 2 + IBLND_OOB_MSGS(v)) +#define IBLND_RX_MSG_BYTES(v) (IBLND_RX_MSGS(v) * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES(v) ((IBLND_RX_MSG_BYTES(v) + PAGE_SIZE - 1) / PAGE_SIZE) + +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS(v) IBLND_RX_MSGS(v) +#define IBLND_SEND_WRS(v) ((IBLND_RDMA_FRAGS(v) + 1) * IBLND_CONCURRENT_SENDS(v)) +#define IBLND_CQ_ENTRIES(v) (IBLND_RECV_WRS(v) + IBLND_SEND_WRS(v)) + +struct kib_hca_dev; + +/* o2iblnd can run over aliased interface */ +#ifdef IFALIASZ +#define KIB_IFNAME_SIZE IFALIASZ +#else +#define KIB_IFNAME_SIZE 256 +#endif + +typedef struct +{ + struct list_head ibd_list; /* chain on kib_devs */ + struct list_head ibd_fail_list; /* chain on kib_failed_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + /** IPoIB interface name */ + char ibd_ifname[KIB_IFNAME_SIZE]; + int ibd_nnets; /* # nets extant */ + + cfs_time_t ibd_next_failover; + int ibd_failed_failover; /* # failover failures */ + unsigned int ibd_failover; /* failover in progress */ + unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */ + struct list_head ibd_nets; + struct kib_hca_dev *ibd_hdev; +} kib_dev_t; + +typedef struct kib_hca_dev +{ + struct rdma_cm_id *ibh_cmid; /* listener cmid */ + struct ib_device *ibh_ibdev; /* IB device */ + int ibh_page_shift; /* page shift of current HCA */ + int ibh_page_size; /* page size of current HCA */ + __u64 ibh_page_mask; /* page mask of current HCA */ + int ibh_mr_shift; /* bits shift of max MR size */ + __u64 ibh_mr_size; /* size of MR */ + int ibh_nmrs; /* # of global MRs */ + struct ib_mr **ibh_mrs; /* global MR */ + struct ib_pd *ibh_pd; /* PD */ + kib_dev_t *ibh_dev; /* owner */ + atomic_t ibh_ref; /* refcount */ +} kib_hca_dev_t; + +/** # of seconds to keep pool alive */ +#define IBLND_POOL_DEADLINE 300 +/** # of seconds to retry if allocation failed */ +#define IBLND_POOL_RETRY 1 + +typedef struct +{ + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; /* page array */ +} kib_pages_t; + +struct kib_pmr_pool; + +typedef struct { + struct list_head pmr_list; /* chain node */ + struct ib_phys_buf *pmr_ipb; /* physical buffer */ + struct ib_mr *pmr_mr; /* IB MR */ + struct kib_pmr_pool *pmr_pool; /* owner of this MR */ + __u64 pmr_iova; /* Virtual I/O address */ + int pmr_refcount; /* reference count */ +} kib_phys_mr_t; + +struct kib_pool; +struct kib_poolset; + +typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, + int inc, struct kib_pool **pp_po); +typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); +typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); +typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); + +struct kib_net; + +#define IBLND_POOL_NAME_LEN 32 + +typedef struct kib_poolset +{ + spinlock_t ps_lock; /* serialize */ + struct kib_net *ps_net; /* network it belongs to */ + char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ + struct list_head ps_pool_list; /* list of pools */ + struct list_head ps_failed_pool_list; /* failed pool list */ + cfs_time_t ps_next_retry; /* time stamp for retry if failed to allocate */ + int ps_increasing; /* is allocating new pool */ + int ps_pool_size; /* new pool size */ + int ps_cpt; /* CPT id */ + + kib_ps_pool_create_t ps_pool_create; /* create a new pool */ + kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ + kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ + kib_ps_node_fini_t ps_node_fini; /* finalize node */ +} kib_poolset_t; + +typedef struct kib_pool +{ + struct list_head po_list; /* chain on pool list */ + struct list_head po_free_list; /* pre-allocated node */ + kib_poolset_t *po_owner; /* pool_set of this pool */ + cfs_time_t po_deadline; /* deadline of this pool */ + int po_allocated; /* # of elements in use */ + int po_failed; /* pool is created on failed HCA */ + int po_size; /* # of pre-allocated elements */ +} kib_pool_t; + +typedef struct { + kib_poolset_t tps_poolset; /* pool-set */ + __u64 tps_next_tx_cookie; /* cookie of TX */ +} kib_tx_poolset_t; + +typedef struct { + kib_pool_t tpo_pool; /* pool */ + struct kib_hca_dev *tpo_hdev; /* device for this pool */ + struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ + kib_pages_t *tpo_tx_pages; /* premapped tx msg pages */ +} kib_tx_pool_t; + +typedef struct { + kib_poolset_t pps_poolset; /* pool-set */ +} kib_pmr_poolset_t; + +typedef struct kib_pmr_pool { + struct kib_hca_dev *ppo_hdev; /* device for this pool */ + kib_pool_t ppo_pool; /* pool */ +} kib_pmr_pool_t; + +typedef struct +{ + spinlock_t fps_lock; /* serialize */ + struct kib_net *fps_net; /* IB network */ + struct list_head fps_pool_list; /* FMR pool list */ + struct list_head fps_failed_pool_list; /* FMR pool list */ + __u64 fps_version; /* validity stamp */ + int fps_cpt; /* CPT id */ + int fps_pool_size; + int fps_flush_trigger; + /* is allocating new pool */ + int fps_increasing; + /* time stamp for retry if failed to allocate */ + cfs_time_t fps_next_retry; +} kib_fmr_poolset_t; + +typedef struct +{ + struct list_head fpo_list; /* chain on pool list */ + struct kib_hca_dev *fpo_hdev; /* device for this pool */ + kib_fmr_poolset_t *fpo_owner; /* owner of this pool */ + struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ + cfs_time_t fpo_deadline; /* deadline of this pool */ + int fpo_failed; /* fmr pool is failed */ + int fpo_map_count; /* # of mapped FMR */ +} kib_fmr_pool_t; + +typedef struct { + struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ + kib_fmr_pool_t *fmr_pool; /* pool of FMR */ +} kib_fmr_t; + +typedef struct kib_net +{ + struct list_head ibn_list; /* chain on kib_dev_t::ibd_nets */ + __u64 ibn_incarnation; /* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? */ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + kib_tx_poolset_t **ibn_tx_ps; /* tx pool-set */ + kib_fmr_poolset_t **ibn_fmr_ps; /* fmr pool-set */ + kib_pmr_poolset_t **ibn_pmr_ps; /* pmr pool-set */ + + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; + +#define KIB_THREAD_SHIFT 16 +#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) +#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) +#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) + +struct kib_sched_info { + /* serialise */ + spinlock_t ibs_lock; + /* schedulers sleep here */ + wait_queue_head_t ibs_waitq; + /* conns to check for rx completions */ + struct list_head ibs_conns; + /* number of scheduler threads */ + int ibs_nthreads; + /* max allowed scheduler threads */ + int ibs_nthreads_max; + int ibs_cpt; /* CPT id */ +}; + +typedef struct +{ + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + /* list head of failed devices */ + struct list_head kib_failed_devs; + /* schedulers sleep here */ + wait_queue_head_t kib_failover_waitq; + atomic_t kib_nthreads; /* # live threads */ + /* stabilize net/dev/peer/conn ops */ + rwlock_t kib_global_lock; + /* hash table of all my known peers */ + struct list_head *kib_peers; + /* size of kib_peers */ + int kib_peer_hash_size; + /* the connd task (serialisation assertions) */ + void *kib_connd; + /* connections to setup/teardown */ + struct list_head kib_connd_conns; + /* connections with zero refcount */ + struct list_head kib_connd_zombies; + /* connection daemon sleeps here */ + wait_queue_head_t kib_connd_waitq; + spinlock_t kib_connd_lock; /* serialise */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ + /* percpt data for schedulers */ + struct kib_sched_info **kib_scheds; +} kib_data_t; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +typedef struct kib_connparams +{ + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR kib_connparams_t; + +typedef struct +{ + lnet_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; + +typedef struct +{ + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! */ +} WIRE_ATTR kib_rdma_frag_t; + +typedef struct +{ + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; + +typedef struct +{ + lnet_hdr_t ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; + +typedef struct +{ + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; + +typedef struct +{ + lnet_hdr_t ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR kib_completion_msg_t; + +typedef struct +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an ibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR kib_msg_t; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION_1 0x11 +#define IBLND_MSG_VERSION_2 0x12 +#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +typedef struct { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ + __u8 ibr_padding; /* padding */ + __u64 ibr_incarnation; /* incarnation of peer */ + kib_connparams_t ibr_cp; /* connection parameters */ +} WIRE_ATTR kib_rej_t; + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ +#define IBLND_REJECT_CONN_STALE 5 /* stale peer */ + +#define IBLND_REJECT_RDMA_FRAGS 6 /* Fatal: peer's rdma frags can't match mine */ +#define IBLND_REJECT_MSG_QUEUE_SIZE 7 /* Fatal: peer's msg queue size can't match mine */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_nob; /* # bytes received (-1 while posted) */ + enum ib_wc_status rx_status; /* completion status */ + kib_msg_t *rx_msg; /* message buffer (host vaddr) */ + __u64 rx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */ + struct ib_recv_wr rx_wrq; /* receive work item... */ + struct ib_sge rx_sge; /* ...and its memory */ +} kib_rx_t; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + kib_tx_pool_t *tx_pool; /* pool I'm from */ + struct kib_conn *tx_conn; /* owning conn */ + short tx_sending; /* # tx callbacks outstanding */ + short tx_queued; /* queued for sending */ + short tx_waiting; /* waiting for peer */ + int tx_status; /* LNET completion status */ + unsigned long tx_deadline; /* completion deadline */ + __u64 tx_cookie; /* completion cookie */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ + kib_msg_t *tx_msg; /* message buffer (host vaddr) */ + __u64 tx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */ + int tx_nwrq; /* # send work items */ + struct ib_send_wr *tx_wrq; /* send work items... */ + struct ib_sge *tx_sge; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor */ + int tx_nfrags; /* # entries in... */ + struct scatterlist *tx_frags; /* dma_map_sg descriptor */ + __u64 *tx_pages; /* rdma phys page addrs */ + union { + kib_phys_mr_t *pmr; /* MR for physical buffer */ + kib_fmr_t fmr; /* FMR */ + } tx_u; + int tx_dmadir; /* dma direction */ +} kib_tx_t; + +typedef struct kib_connvars +{ + /* connection-in-progress variables */ + kib_msg_t cv_msg; +} kib_connvars_t; + +typedef struct kib_conn +{ + struct kib_sched_info *ibc_sched; /* scheduler information */ + struct kib_peer *ibc_peer; /* owning peer */ + kib_hca_dev_t *ibc_hdev; /* HCA bound on */ + struct list_head ibc_list; /* stash on peer's conn list */ + struct list_head ibc_sched_list; /* schedule for attention */ + __u16 ibc_version; /* version of connection */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_noops_posted; /* # uncompleted NOOPs */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits;/* # ACK/DONE msg credits */ + int ibc_comms_error; /* set on comms error */ + unsigned int ibc_nrx:16; /* receive buffers owned */ + unsigned int ibc_scheduled:1; /* scheduled for attention */ + unsigned int ibc_ready:1; /* CQ callback fired */ + /* time of last send */ + unsigned long ibc_last_send; + /** link chain for kiblnd_check_conns only */ + struct list_head ibc_connd_list; + /** rxs completed before ESTABLISHED */ + struct list_head ibc_early_rxs; + /** IBLND_MSG_NOOPs for IBLND_MSG_VERSION_1 */ + struct list_head ibc_tx_noops; + struct list_head ibc_tx_queue; /* sends that need a credit */ + struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + + struct rdma_cm_id *ibc_cmid; /* CM id */ + struct ib_cq *ibc_cq; /* completion queue */ + + kib_connvars_t *ibc_connvars; /* in-progress connection state */ +} kib_conn_t; + +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */ +#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_ni_t *ibp_ni; /* LNet interface */ + atomic_t ibp_refcount; /* # users */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + __u16 ibp_version; /* version of peer */ + __u64 ibp_incarnation; /* incarnation of peer */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ + int ibp_error; /* errno on closing this peer */ + cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ +} kib_peer_t; + +extern kib_data_t kiblnd_data; + +extern void kiblnd_hdev_destroy(kib_hca_dev_t *hdev); + +static inline void +kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + atomic_inc(&hdev->ibh_ref); +} + +static inline void +kiblnd_hdev_decref(kib_hca_dev_t *hdev) +{ + LASSERT (atomic_read(&hdev->ibh_ref) > 0); + if (atomic_dec_and_test(&hdev->ibh_ref)) + kiblnd_hdev_destroy(hdev); +} + +static inline int +kiblnd_dev_can_failover(kib_dev_t *dev) +{ + if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover == 0) /* disabled */ + return 0; + + if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ + return 1; + + return dev->ibd_can_failover; +} + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ + } \ +} while (0) + +#define kiblnd_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kiblnd_destroy_peer(peer); \ +} while (0) + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = + ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return (&kiblnd_data.kib_peers [hash]); +} + +static inline int +kiblnd_peer_active (kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline kib_conn_t * +kiblnd_get_conn_locked (kib_peer_t *peer) +{ + LASSERT (!list_empty(&peer->ibp_conns)); + + /* just return the first connection */ + return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list); +} + +static inline int +kiblnd_send_keepalive(kib_conn_t *conn) +{ + return (*kiblnd_tunables.kib_keepalive > 0) && + cfs_time_after(jiffies, conn->ibc_last_send + + *kiblnd_tunables.kib_keepalive*HZ); +} + +static inline int +kiblnd_need_noop(kib_conn_t *conn) +{ + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_outstanding_credits < + IBLND_CREDITS_HIGHWATER(conn->ibc_version) && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) + return 0; /* NOOP can be piggybacked */ + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || + conn->ibc_credits == 0); + } + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + +static inline void +kiblnd_abort_receives(kib_conn_t *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +static inline const char * +kiblnd_queue2str (kib_conn_t *conn, struct list_head *q) +{ + if (q == &conn->ibc_tx_queue) + return "tx_queue"; + + if (q == &conn->ibc_tx_queue_rsrvd) + return "tx_queue_rsrvd"; + + if (q == &conn->ibc_tx_queue_nocred) + return "tx_queue_nocred"; + + if (q == &conn->ibc_active_txs) + return "active_txs"; + + LBUG(); + return NULL; +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. */ + +#define IBLND_WID_TX 0 +#define IBLND_WID_RDMA 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_MASK 3UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state (kib_conn_t *conn, int state) +{ + conn->ibc_state = state; + mb(); +} + +static inline void +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} + +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} + +static inline __u64 +kiblnd_rd_frag_addr(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_addr; +} + +static inline __u32 +kiblnd_rd_frag_size(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_frags[index].rf_nob; +} + +static inline __u32 +kiblnd_rd_frag_key(kib_rdma_desc_t *rd, int index) +{ + return rd->rd_key; +} + +static inline int +kiblnd_rd_consume_frag(kib_rdma_desc_t *rd, int index, __u32 nob) +{ + if (nob < rd->rd_frags[index].rf_nob) { + rd->rd_frags[index].rf_addr += nob; + rd->rd_frags[index].rf_nob -= nob; + } else { + index ++; + } + + return index; +} + +static inline int +kiblnd_rd_msg_size(kib_rdma_desc_t *rd, int msgtype, int n) +{ + LASSERT (msgtype == IBLND_MSG_GET_REQ || + msgtype == IBLND_MSG_PUT_ACK); + + return msgtype == IBLND_MSG_GET_REQ ? + offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]) : + offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); +} + + +static inline __u64 +kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return ib_dma_mapping_error(dev, dma_addr); +} + +static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, + void *msg, size_t size, + enum dma_data_direction direction) +{ + return ib_dma_map_single(dev, msg, size, direction); +} + +static inline void kiblnd_dma_unmap_single(struct ib_device *dev, + __u64 addr, size_t size, + enum dma_data_direction direction) +{ + ib_dma_unmap_single(dev, addr, size, direction); +} + +#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) +#define KIBLND_UNMAP_ADDR(p, m, a) (a) + +static inline int kiblnd_dma_map_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + return ib_dma_map_sg(dev, sg, nents, direction); +} + +static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + ib_dma_unmap_sg(dev, sg, nents, direction); +} + +static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_address(dev, sg); +} + +static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, + struct scatterlist *sg) +{ + return ib_sg_dma_len(dev, sg); +} + +/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly + * right because OFED1.2 defines it as const, to use it we have to add + * (void *) cast to overcome "const" */ + +#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) +#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) + + +struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd); +struct ib_mr *kiblnd_find_dma_mr(kib_hca_dev_t *hdev, + __u64 addr, __u64 size); +void kiblnd_map_rx_descs(kib_conn_t *conn); +void kiblnd_unmap_rx_descs(kib_conn_t *conn); +int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags); +void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx); +void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node); +struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps); + +int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, + int npages, __u64 iov, kib_fmr_t *fmr); +void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status); + +int kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev, + kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr); +void kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr); + +int kiblnd_startup (lnet_ni_t *ni); +void kiblnd_shutdown (lnet_ni_t *ni); +int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +void kiblnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); + +int kiblnd_tunables_init(void); +void kiblnd_tunables_fini(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); +int kiblnd_failover_thread (void *arg); + +int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages); +void kiblnd_free_pages (kib_pages_t *p); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); +int kiblnd_translate_mtu(int value); + +int kiblnd_dev_failover(kib_dev_t *dev); +int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); +void kiblnd_destroy_peer (kib_peer_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_t *peer); +void kiblnd_peer_alive (kib_peer_t *peer); +kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid); +void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error); +int kiblnd_close_stale_conns_locked (kib_peer_t *peer, + int version, __u64 incarnation); +int kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why); + +void kiblnd_connreq_done(kib_conn_t *conn, int status); +kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, + int state, int version); +void kiblnd_destroy_conn (kib_conn_t *conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked (kib_conn_t *conn, int error); + +int kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type, + int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); + +void kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid); +void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob); +void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, + int status); +void kiblnd_check_sends (kib_conn_t *conn); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); + +int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 index 000000000000..cc6232126dd0 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,3529 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_cb.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +void +kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx) +{ + lnet_msg_t *lntmsg[2]; + kib_net_t *net = ni->ni_data; + int rc; + int i; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ + LASSERT (tx->tx_pool != NULL); + + kiblnd_unmap_tx(ni, tx); + + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni); + + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = 0; + tx->tx_status = 0; + + kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize(ni, lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kiblnd_tx_done(ni, tx); + } +} + +kib_tx_t * +kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target) +{ + kib_net_t *net = (kib_net_t *)ni->ni_data; + struct list_head *node; + kib_tx_t *tx; + kib_tx_poolset_t *tps; + + tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; + node = kiblnd_pool_alloc_node(&tps->tps_poolset); + if (node == NULL) + return NULL; + tx = container_of(node, kib_tx_t, tx_list); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + LASSERT (tx->tx_u.pmr == NULL); + LASSERT (tx->tx_nfrags == 0); + + return tx; +} + +void +kiblnd_drop_rx(kib_rx_t *rx) +{ + kib_conn_t *conn = rx->rx_conn; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + spin_lock_irqsave(&sched->ibs_lock, flags); + LASSERT(conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx (kib_rx_t *rx, int credit) +{ + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq = NULL; + struct ib_mr *mr; + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); + + mr = kiblnd_find_dma_mr(conn->ibc_hdev, rx->rx_msgaddr, IBLND_MSG_SIZE); + LASSERT (mr != NULL); + + rx->rx_sge.lkey = mr->lkey; + rx->rx_sge.addr = rx->rx_msgaddr; + rx->rx_sge.length = IBLND_MSG_SIZE; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); + if (rc != 0) { + CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); + rx->rx_nob = 0; + } + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + return rc; + + if (rc != 0) { + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return rc; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + return 0; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + return 0; +} + +kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +void +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(ni, tx); +} + +void +kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie) +{ + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + + kiblnd_queue_tx(tx, conn); +} + +void +kiblnd_handle_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? */ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP && + !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ + conn->ibc_outstanding_credits++; + + spin_unlock(&conn->ibc_lock); + kiblnd_check_sends(conn); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) { + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + } + + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. */ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +void +kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + int rc; + int err = -EIO; + + LASSERT (net != NULL); + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CNETERR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT (nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! */ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + + write_lock_irqsave(g_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(g_lock, flags); + return; + } + write_unlock_irqrestore(g_lock, flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. */ +} + +struct page * +kiblnd_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) { + page = vmalloc_to_page ((void *)vaddr); + LASSERT (page != NULL); + return page; + } +#ifdef CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } +#endif + page = virt_to_page (vaddr); + LASSERT (page != NULL); + return page; +} + +static int +kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + __u64 *pages = tx->tx_pages; + kib_fmr_poolset_t *fps; + int npages; + int size; + int cpt; + int rc; + int i; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { + for (size = 0; size < rd->rd_frags[i].rf_nob; + size += hdev->ibh_page_size) { + pages[npages ++] = (rd->rd_frags[i].rf_addr & + hdev->ibh_page_mask) + size; + } + } + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + fps = net->ibn_fmr_ps[cpt]; + rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr); + if (rc != 0) { + CERROR ("Can't map %d pages: %d\n", npages, rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.fmr.fmr_pfmr->fmr->rkey : + tx->tx_u.fmr.fmr_pfmr->fmr->lkey; + rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; + rd->rd_frags[0].rf_nob = nob; + rd->rd_nfrags = 1; + + return 0; +} + +static int +kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob) +{ + kib_hca_dev_t *hdev; + kib_pmr_poolset_t *pps; + __u64 iova; + int cpt; + int rc; + + LASSERT(tx->tx_pool != NULL); + LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL); + + hdev = tx->tx_pool->tpo_hdev; + + iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask; + + cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; + + pps = net->ibn_pmr_ps[cpt]; + rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr); + if (rc != 0) { + CERROR("Failed to create MR by phybuf: %d\n", rc); + return rc; + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + rd->rd_key = (rd != tx->tx_rd) ? tx->tx_u.pmr->pmr_mr->rkey : + tx->tx_u.pmr->pmr_mr->lkey; + rd->rd_nfrags = 1; + rd->rd_frags[0].rf_addr = iova; + rd->rd_frags[0].rf_nob = nob; + + return 0; +} + +void +kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx) +{ + kib_net_t *net = ni->ni_data; + + LASSERT(net != NULL); + + if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) { + kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status); + tx->tx_u.fmr.fmr_pfmr = NULL; + + } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) { + kiblnd_pmr_pool_unmap(tx->tx_u.pmr); + tx->tx_u.pmr = NULL; + } + + if (tx->tx_nfrags != 0) { + kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +} + +int +kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, + kib_rdma_desc_t *rd, int nfrags) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + kib_net_t *net = ni->ni_data; + struct ib_mr *mr = NULL; + __u32 nob; + int i; + + /* If rd is not tx_rd, it's going to get sent to a peer and I'm the + * RDMA sink */ + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + tx->tx_nfrags = nfrags; + + rd->rd_nfrags = + kiblnd_dma_map_sg(hdev->ibh_ibdev, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + + for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( + hdev->ibh_ibdev, &tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( + hdev->ibh_ibdev, &tx->tx_frags[i]); + nob += rd->rd_frags[i].rf_nob; + } + + /* looking for pre-mapping MR */ + mr = kiblnd_find_rd_dma_mr(hdev, rd); + if (mr != NULL) { + /* found pre-mapping MR */ + rd->rd_key = (rd != tx->tx_rd) ? mr->rkey : mr->lkey; + return 0; + } + + if (net->ibn_fmr_ps != NULL) + return kiblnd_fmr_map_tx(net, tx, rd, nob); + else if (net->ibn_pmr_ps != NULL) + return kiblnd_pmr_map_tx(net, tx, rd, nob); + + return -EINVAL; +} + + +int +kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct iovec *iov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct page *page; + struct scatterlist *sg; + unsigned long vaddr; + int fragnob; + int page_offset; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT (niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kiblnd_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR ("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + sg_set_page(sg, page, fragnob, page_offset); + sg++; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +int +kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + kib_net_t *net = ni->ni_data; + struct scatterlist *sg; + int fragnob; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT (nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + sg_set_page(sg, kiov->kiov_page, fragnob, + kiov->kiov_offset + offset); + sg++; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); +} + +int +kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit) +{ + kib_msg_t *msg = tx->tx_msg; + kib_peer_t *peer = conn->ibc_peer; + int ver = conn->ibc_version; + int rc; + int done; + struct ib_send_wr *bad_wrq; + + LASSERT (tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nwrq > 0); + LASSERT (tx->tx_nwrq <= 1 + IBLND_RDMA_FRAGS(ver)); + + LASSERT (credit == 0 || credit == 1); + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE(ver)); + + if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) { + /* tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + if (credit != 0 && !IBLND_OOB_CAPABLE(ver) && + conn->ibc_credits == 1 && /* last credit reserved */ + msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer->ibp_nid)); + return -EAGAIN; + } + + /* NB don't drop ibc_lock before bumping tx_sending */ + list_del(&tx->tx_list); + tx->tx_queued = 0; + + if (msg->ibm_type == IBLND_MSG_NOOP && + (!kiblnd_need_noop(conn) || /* redundant NOOP */ + (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ + conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { + /* OK to drop when posted enough NOOPs, since + * kiblnd_check_sends will queue NOOP again when + * posted NOOPs complete */ + spin_unlock(&conn->ibc_lock); + kiblnd_tx_done(peer->ibp_ni, tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_noops_posted); + return 0; + } + + kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits, + peer->ibp_nid, conn->ibc_incarnation); + + conn->ibc_credits -= credit; + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted++; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() + * from the first send; hence the ++ rather than = below. */ + tx->tx_sending++; + list_add(&tx->tx_list, &conn->ibc_active_txs); + + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { + rc = ib_post_send(conn->ibc_cmid->qp, + tx->tx_wrq, &bad_wrq); + } + + conn->ibc_last_send = jiffies; + + if (rc == 0) + return 0; + + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_credits += credit; + conn->ibc_outstanding_credits += msg->ibm_credits; + conn->ibc_nsends_posted--; + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(peer->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(peer->ibp_ni, tx); + + spin_lock(&conn->ibc_lock); + + return -EIO; +} + +void +kiblnd_check_sends (kib_conn_t *conn) +{ + int ver = conn->ibc_version; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + spin_lock(&conn->ibc_lock); + + LASSERT (conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver)); + LASSERT (!IBLND_OOB_CAPABLE(ver) || + conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (kiblnd_need_noop(conn)) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + kiblnd_conn_addref(conn); /* 1 ref for me.... (see b21911) */ + + for (;;) { + int credit; + + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + credit = 0; + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + LASSERT (!IBLND_OOB_CAPABLE(ver)); + credit = 1; + tx = list_entry(conn->ibc_tx_noops.next, + kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_queue)) { + credit = 1; + tx = list_entry(conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + } else + break; + + if (kiblnd_post_tx_locked(conn, tx, credit) != 0) + break; + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +void +kiblnd_tx_complete (kib_tx_t *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; + + LASSERT (tx->tx_sending > 0); + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CNETERR("Tx -> %s cookie "LPX64 + " sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_noops_posted--; + + if (failed) { + tx->tx_waiting = 0; /* don't wait for peer */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_conn_addref(conn); /* 1 ref for me.... */ + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); + + kiblnd_check_sends(conn); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +void +kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob) +{ + kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; + struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; + struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + struct ib_mr *mr; + + LASSERT (tx->tx_nwrq >= 0); + LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT (nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + + mr = kiblnd_find_dma_mr(hdev, tx->tx_msgaddr, nob); + LASSERT (mr != NULL); + + sge->lkey = mr->lkey; + sge->addr = tx->tx_msgaddr; + sge->length = nob; + + memset(wrq, 0, sizeof(*wrq)); + + wrq->next = NULL; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_SEND; + wrq->send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +int +kiblnd_init_rdma (kib_conn_t *conn, kib_tx_t *tx, int type, + int resid, kib_rdma_desc_t *dstrd, __u64 dstcookie) +{ + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + struct ib_sge *sge = &tx->tx_sge[0]; + struct ib_send_wr *wrq = &tx->tx_wrq[0]; + int rc = resid; + int srcidx; + int dstidx; + int wrknob; + + LASSERT (!in_interrupt()); + LASSERT (tx->tx_nwrq == 0); + LASSERT (type == IBLND_MSG_GET_DONE || + type == IBLND_MSG_PUT_DONE); + + srcidx = dstidx = 0; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBLND_RDMA_FRAGS(conn->ibc_version)) { + CERROR("RDMA too fragmented for %s (%d): " + "%d/%d src %d/%d dst frags\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + IBLND_RDMA_FRAGS(conn->ibc_version), + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + wrknob = MIN(MIN(kiblnd_rd_frag_size(srcrd, srcidx), + kiblnd_rd_frag_size(dstrd, dstidx)), resid); + + sge = &tx->tx_sge[tx->tx_nwrq]; + sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); + sge->lkey = kiblnd_rd_frag_key(srcrd, srcidx); + sge->length = wrknob; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->next = wrq + 1; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_RDMA_WRITE; + wrq->send_flags = 0; + + wrq->wr.rdma.remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx); + wrq->wr.rdma.rkey = kiblnd_rd_frag_key(dstrd, dstidx); + + srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob); + dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob); + + resid -= wrknob; + + tx->tx_nwrq++; + wrq++; + sge++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = 0; + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, + type, sizeof (kib_completion_msg_t)); + + return rc; +} + +void +kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) +{ + struct list_head *q; + + LASSERT (tx->tx_nwrq > 0); /* work items set up */ + LASSERT (!tx->tx_queued); /* not queued for sending already */ + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + if (IBLND_OOB_CAPABLE(conn->ibc_version)) + q = &conn->ibc_tx_queue_nocred; + else + q = &conn->ibc_tx_noops; + break; + + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +void +kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); +} + +static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, + struct sockaddr_in *srcaddr, + struct sockaddr_in *dstaddr, + int timeout_ms) +{ + unsigned short port; + int rc; + + /* allow the port to be reused */ + rc = rdma_set_reuseaddr(cmid, 1); + if (rc != 0) { + CERROR("Unable to set reuse on cmid: %d\n", rc); + return rc; + } + + /* look for a free privileged port */ + for (port = PROT_SOCK-1; port > 0; port--) { + srcaddr->sin_port = htons(port); + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)srcaddr, + (struct sockaddr *)dstaddr, + timeout_ms); + if (rc == 0) { + CDEBUG(D_NET, "bound to port %hu\n", port); + return 0; + } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { + CDEBUG(D_NET, "bind to port %hu failed: %d\n", + port, rc); + } else { + return rc; + } + } + + CERROR("Failed to bind to a free privileged port\n"); + return rc; +} + +void +kiblnd_connect_peer (kib_peer_t *peer) +{ + struct rdma_cm_id *cmid; + kib_dev_t *dev; + kib_net_t *net = peer->ibp_ni->ni_data; + struct sockaddr_in srcaddr; + struct sockaddr_in dstaddr; + int rc; + + LASSERT (net != NULL); + LASSERT (peer->ibp_connecting > 0); + + cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP, + IB_QPT_RC); + + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + dev = net->ibn_dev; + memset(&srcaddr, 0, sizeof(srcaddr)); + srcaddr.sin_family = AF_INET; + srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); + + memset(&dstaddr, 0, sizeof(dstaddr)); + dstaddr.sin_family = AF_INET; + dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); + dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); + + kiblnd_peer_addref(peer); /* cmid's ref */ + + if (*kiblnd_tunables.kib_use_priv_port) { + rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } else { + rc = rdma_resolve_addr(cmid, + (struct sockaddr *)&srcaddr, + (struct sockaddr *)&dstaddr, + *kiblnd_tunables.kib_timeout * 1000); + } + if (rc != 0) { + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed2; + } + + LASSERT (cmid->device != NULL); + CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n", + libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname, + HIPQUAD(dev->ibd_ifip), cmid->device->name); + + return; + + failed2: + kiblnd_peer_decref(peer); /* cmid's ref */ + rdma_destroy_id(cmid); + failed: + kiblnd_peer_connect_failed(peer, 1, rc); +} + +void +kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx == NULL || tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx == NULL || tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer + * connected */ + read_lock_irqsave(g_lock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL && !list_empty(&peer->ibp_conns)) { + /* Found a peer with an established connection */ + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + read_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + if (list_empty(&peer->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer ready to add to the peer table and retry */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); + if (tx != NULL) { + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + } + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT (peer2->ibp_connecting != 0 || + peer2->ibp_accepting != 0); + if (tx != NULL) + list_add_tail(&tx->tx_list, + &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + if (tx != NULL) + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer); + return; + } + + /* Brand new peer */ + LASSERT (peer->ibp_connecting == 0); + peer->ibp_connecting = 1; + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + + if (tx != NULL) + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_connect_peer(peer); + kiblnd_peer_decref(peer); +} + +int +kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kiblnd_setup_rd_iov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &ibmsg->ibm_u.get.ibgm_rd, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + if (rc != 0) { + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]); + ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; + ibmsg->ibm_u.get.ibgm_hdr = *hdr; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); + + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ + tx->tx_waiting = 1; /* waiting for GET_DONE */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBLND_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni, target.nid); + if (tx == NULL) { + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +void +kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(rx->rx_conn, tx, + IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! */ + lnet_finalize(ni, lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kiblnd_tx_done(ni, tx); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); +} + +int +kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + kib_msg_t *txmsg; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize (ni, lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_tx_done(ni, tx); + /* tell peer it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]); + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; + + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + task_t *task = kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + atomic_inc(&kiblnd_data.kib_nthreads); + return 0; +} + +void +kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +void +kiblnd_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kiblnd_peer_notify (kib_peer_t *peer) +{ + int error = 0; + cfs_time_t last_alive = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + + last_alive = peer->ibp_last_alive; + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer->ibp_ni, + peer->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + kib_dev_t *dev; + unsigned long flags; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_noops) && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); + } + + dev = ((kib_net_t *)peer->ibp_ni->ni_data)->ibn_dev; + list_del(&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty (&peer->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer)) { /* still in peer table */ + kiblnd_unlink_peer_locked(peer); + + /* set/clear error on last conn */ + peer->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + if (error != 0 && + kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + wake_up(&kiblnd_data.kib_failover_waitq); + } + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up(&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn(kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + + LASSERT(!in_interrupt()); + LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe (tmp, nxt, txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || + tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + tx->tx_queued = 0; + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED); +} + +void +kiblnd_finalise_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error) +{ + LIST_HEAD (zombies); + unsigned long flags; + + LASSERT (error != 0); + LASSERT (!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + } else { + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + } + + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { + /* another connection attempt under way... */ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kiblnd_peer_active(peer)) + kiblnd_unlink_peer_locked(peer); + + peer->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer); + + if (list_empty (&zombies)) + return; + + CNETERR("Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); +} + +void +kiblnd_connreq_done(kib_conn_t *conn, int status) +{ + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + struct list_head txs; + unsigned long flags; + int active; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: active(%d), version(%x), status(%d)\n", + libcfs_nid2str(peer->ibp_nid), active, + conn->ibc_version, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(peer, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = jiffies; + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer); + + /* Add conn to peer's list and nuke any dangling conns from a different + * peer instance... */ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer->ibp_conns); + if (active) + peer->ibp_connecting--; + else + peer->ibp_accepting--; + + if (peer->ibp_version == 0) { + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + if (peer->ibp_version != conn->ibc_version || + peer->ibp_incarnation != conn->ibc_incarnation) { + kiblnd_close_stale_conns_locked(peer, conn->ibc_version, + conn->ibc_incarnation); + peer->ibp_version = conn->ibc_version; + peer->ibp_incarnation = conn->ibc_incarnation; + } + + /* grab pending txs while I have the lock */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (!kiblnd_peer_active(peer) || /* peer has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + lnet_ni_t *ni = peer->ibp_ni; + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &txs, -ECONNABORTED); + + return; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock(&conn->ibc_lock); + while (!list_empty(&txs)) { + tx = list_entry(txs.next, kib_tx_t, tx_list); + list_del(&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_reject(struct rdma_cm_id *cmid, kib_rej_t *rej) +{ + int rc; + + rc = rdma_reject(cmid, rej, sizeof(*rej)); + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +int +kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + kib_msg_t *reqmsg = priv; + kib_msg_t *ackmsg; + kib_dev_t *ibdev; + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + lnet_ni_t *ni = NULL; + kib_net_t *net = NULL; + lnet_nid_t nid; + struct rdma_conn_param cp; + kib_rej_t rej; + int version = IBLND_MSG_VERSION; + unsigned long flags; + int rc; + struct sockaddr_in *peer_addr; + LASSERT (!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = (kib_dev_t *)cmid->context; + LASSERT (ibdev != NULL); + + memset(&rej, 0, sizeof(rej)); + rej.ibr_magic = IBLND_MSG_MAGIC; + rej.ibr_why = IBLND_REJECT_FATAL; + rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; + + peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr); + if (*kiblnd_tunables.kib_require_priv_port && + ntohs(peer_addr->sin_port) >= PROT_SOCK) { + __u32 ip = ntohl(peer_addr->sin_addr.s_addr); + CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n", + HIPQUAD(ip), ntohs(peer_addr->sin_port)); + goto failed; + } + + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. */ + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + goto failed; + if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION && + reqmsg->ibm_version != IBLND_MSG_VERSION_1) + goto failed; + if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) + goto failed; + + rc = kiblnd_unpack_msg(reqmsg, priv_nob); + if (rc != 0) { + CERROR("Can't parse connection request: %d\n", rc); + goto failed; + } + + nid = reqmsg->ibm_srcnid; + ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid)); + + if (ni != NULL) { + net = (kib_net_t *)ni->ni_data; + rej.ibr_incarnation = net->ibn_incarnation; + } + + if (ni == NULL || /* no matching net */ + ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ + net->ibn_dev != ibdev) { /* wrong device */ + CERROR("Can't accept %s on %s (%s:%d:%u.%u.%u.%u): " + "bad dst nid %s\n", libcfs_nid2str(nid), + ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid), + ibdev->ibd_ifname, ibdev->ibd_nnets, + HIPQUAD(ibdev->ibd_ifip), + libcfs_nid2str(reqmsg->ibm_dstnid)); + + goto failed; + } + + /* check time stamp as soon as possible */ + if (reqmsg->ibm_dststamp != 0 && + reqmsg->ibm_dststamp != net->ibn_incarnation) { + CWARN("Stale connection request\n"); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* I can accept peer's version */ + version = reqmsg->ibm_version; + + if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { + CERROR("Unexpected connreq msg type: %x from %s\n", + reqmsg->ibm_type, libcfs_nid2str(nid)); + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(version)) { + CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", + libcfs_nid2str(nid), reqmsg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; + + goto failed; + } + + if (reqmsg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(version)) { + CERROR("Can't accept %s(version %x): " + "incompatible max_frags %d (%d wanted)\n", + libcfs_nid2str(nid), version, + reqmsg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(version)); + + if (version == IBLND_MSG_VERSION) + rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; + + goto failed; + + } + + if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(nid), + reqmsg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + goto failed; + } + + /* assume 'nid' is a new peer; create */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (peer2->ibp_version == 0) { + peer2->ibp_version = version; + peer2->ibp_incarnation = reqmsg->ibm_srcstamp; + } + + /* not the guy I've talked with */ + if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp || + peer2->ibp_version != version) { + kiblnd_close_peer_conns_locked(peer2, -ESTALE); + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn stale %s [old ver: %x, new ver: %x]\n", + libcfs_nid2str(nid), peer2->ibp_version, version); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_STALE; + goto failed; + } + + /* tie-break connection race in favour of the higher NID */ + if (peer2->ibp_connecting != 0 && + nid < ni->ni_nid) { + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn race %s\n", libcfs_nid2str(peer2->ibp_nid)); + + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_CONN_RACE; + goto failed; + } + + peer2->ibp_accepting++; + kiblnd_peer_addref(peer2); + + write_unlock_irqrestore(g_lock, flags); + kiblnd_peer_decref(peer); + peer = peer2; + } else { + /* Brand new peer */ + LASSERT (peer->ibp_accepting == 0); + LASSERT (peer->ibp_version == 0 && + peer->ibp_incarnation == 0); + + peer->ibp_accepting = 1; + peer->ibp_version = version; + peer->ibp_incarnation = reqmsg->ibm_srcstamp; + + /* I have a ref on ni that prevents it being shutdown */ + LASSERT (net->ibn_shutdown == 0); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + } + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 0, -ENOMEM); + kiblnd_peer_decref(peer); + rej.ibr_why = IBLND_REJECT_NO_RESOURCES; + goto failed; + } + + /* conn now "owns" cmid, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. */ + + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE(version); + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(version); + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(version) + <= IBLND_RX_MSGS(version)); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + + kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + rej.ibr_version = version; + rej.ibr_why = IBLND_REJECT_FATAL; + + kiblnd_reject(cmid, &rej); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) + lnet_ni_decref(ni); + + rej.ibr_version = version; + rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + kiblnd_reject(cmid, &rej); + + return -ECONNREFUSED; +} + +void +kiblnd_reconnect (kib_conn_t *conn, int version, + __u64 incarnation, int why, kib_connparams_t *cp) +{ + kib_peer_t *peer = conn->ibc_peer; + char *reason; + int retry = 0; + unsigned long flags; + + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */ + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress + * NB: reconnect is still needed even when ibp_tx_queue is + * empty if ibp_version != version because reconnect may be + * initiated by kiblnd_query() */ + if ((!list_empty(&peer->ibp_tx_queue) || + peer->ibp_version != version) && + peer->ibp_connecting == 1 && + peer->ibp_accepting == 0) { + retry = 1; + peer->ibp_connecting++; + + peer->ibp_version = version; + peer->ibp_incarnation = incarnation; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!retry) + return; + + switch (why) { + default: + reason = "Unknown"; + break; + + case IBLND_REJECT_CONN_STALE: + reason = "stale"; + break; + + case IBLND_REJECT_CONN_RACE: + reason = "conn race"; + break; + + case IBLND_REJECT_CONN_UNCOMPAT: + reason = "version negotiation"; + break; + } + + CNETERR("%s: retrying (%s), %x, %x, " + "queue_dep: %d, max_frag: %d, msg_size: %d\n", + libcfs_nid2str(peer->ibp_nid), + reason, IBLND_MSG_VERSION, version, + cp != NULL? cp->ibcp_queue_depth :IBLND_MSG_QUEUE_SIZE(version), + cp != NULL? cp->ibcp_max_frags : IBLND_RDMA_FRAGS(version), + cp != NULL? cp->ibcp_max_msg_size: IBLND_MSG_SIZE); + + kiblnd_connect_peer(peer); +} + +void +kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_reconnect(conn, IBLND_MSG_VERSION, 0, + IBLND_REJECT_CONN_STALE, NULL); + break; + + case IB_CM_REJ_INVALID_SERVICE_ID: + CNETERR("%s rejected: no listener at %d\n", + libcfs_nid2str(peer->ibp_nid), + *kiblnd_tunables.kib_service); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= offsetof(kib_rej_t, ibr_padding)) { + kib_rej_t *rej = priv; + kib_connparams_t *cp = NULL; + int flip = 0; + __u64 incarnation = -1; + + /* NB. default incarnation is -1 because: + * a) V1 will ignore dst incarnation in connreq. + * b) V2 will provide incarnation while rejecting me, + * -1 will be overwrote. + * + * if I try to connect to a V1 peer with V2 protocol, + * it rejected me then upgrade to V2, I have no idea + * about the upgrading and try to reconnect with V1, + * in this case upgraded V2 can find out I'm trying to + * talk to the old guy and reject me(incarnation is -1). + */ + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + flip = 1; + } + + if (priv_nob >= sizeof(kib_rej_t) && + rej->ibr_version > IBLND_MSG_VERSION_1) { + /* priv_nob is always 148 in current version + * of OFED, so we still need to check version. + * (define of IB_CM_REJ_PRIVATE_DATA_SIZE) */ + cp = &rej->ibr_cp; + + if (flip) { + __swab64s(&rej->ibr_incarnation); + __swab16s(&cp->ibcp_queue_depth); + __swab16s(&cp->ibcp_max_frags); + __swab32s(&cp->ibcp_max_msg_size); + } + + incarnation = rej->ibr_incarnation; + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION && + rej->ibr_version != IBLND_MSG_VERSION_1) { + CERROR("%s rejected: o2iblnd version %x error\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_version); + break; + } + + if (rej->ibr_why == IBLND_REJECT_FATAL && + rej->ibr_version == IBLND_MSG_VERSION_1) { + CDEBUG(D_NET, "rejected by old version peer %s: %x\n", + libcfs_nid2str(peer->ibp_nid), rej->ibr_version); + + if (conn->ibc_version != IBLND_MSG_VERSION_1) + rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + case IBLND_REJECT_CONN_STALE: + case IBLND_REJECT_CONN_UNCOMPAT: + kiblnd_reconnect(conn, rej->ibr_version, + incarnation, rej->ibr_why, cp); + break; + + case IBLND_REJECT_MSG_QUEUE_SIZE: + CERROR("%s rejected: incompatible message queue depth %d, %d\n", + libcfs_nid2str(peer->ibp_nid), cp->ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(conn->ibc_version)); + break; + + case IBLND_REJECT_RDMA_FRAGS: + CERROR("%s rejected: incompatible # of RDMA fragments %d, %d\n", + libcfs_nid2str(peer->ibp_nid), cp->ibcp_max_frags, + IBLND_RDMA_FRAGS(conn->ibc_version)); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + + default: + CERROR("%s rejected: o2iblnd reason %d\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_why); + break; + } + break; + } + /* fall through */ + default: + CNETERR("%s rejected: reason %d, size %d\n", + libcfs_nid2str(peer->ibp_nid), reason, priv_nob); + break; + } + + kiblnd_connreq_done(conn, -ECONNREFUSED); +} + +void +kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + lnet_ni_t *ni = peer->ibp_ni; + kib_net_t *net = ni->ni_data; + kib_msg_t *msg = priv; + int ver = conn->ibc_version; + int rc = kiblnd_unpack_msg(msg, priv_nob); + unsigned long flags; + + LASSERT (net != NULL); + + if (rc != 0) { + CERROR("Can't unpack connack from %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + goto failed; + } + + if (msg->ibm_type != IBLND_MSG_CONNACK) { + CERROR("Unexpected message %d from %s\n", + msg->ibm_type, libcfs_nid2str(peer->ibp_nid)); + rc = -EPROTO; + goto failed; + } + + if (ver != msg->ibm_version) { + CERROR("%s replied version %x is different with " + "requested version %x\n", + libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth != + IBLND_MSG_QUEUE_SIZE(ver)) { + CERROR("%s has incompatible queue depth %d(%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_queue_depth, + IBLND_MSG_QUEUE_SIZE(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_frags != + IBLND_RDMA_FRAGS(ver)) { + CERROR("%s has incompatible max_frags %d (%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_frags, + IBLND_RDMA_FRAGS(ver)); + rc = -EPROTO; + goto failed; + } + + if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer->ibp_nid), + msg->ibm_u.connparams.ibcp_max_msg_size, + IBLND_MSG_SIZE); + rc = -EPROTO; + goto failed; + } + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + if (msg->ibm_dstnid == ni->ni_nid && + msg->ibm_dststamp == net->ibn_incarnation) + rc = 0; + else + rc = -ESTALE; + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Bad connection reply from %s, rc = %d, " + "version: %x max_frags: %d\n", + libcfs_nid2str(peer->ibp_nid), rc, + msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); + goto failed; + } + + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE(ver); + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + IBLND_OOB_MSGS(ver) + <= IBLND_RX_MSGS(ver)); + + kiblnd_connreq_done(conn, 0); + return; + + failed: + /* NB My QP has already established itself, so I handle anything going + * wrong here by setting ibc_comms_error. + * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then + * immediately tears it down. */ + + LASSERT (rc != 0); + conn->ibc_comms_error = rc; + kiblnd_connreq_done(conn, 0); +} + +int +kiblnd_active_connect (struct rdma_cm_id *cmid) +{ + kib_peer_t *peer = (kib_peer_t *)cmid->context; + kib_conn_t *conn; + kib_msg_t *msg; + struct rdma_conn_param cp; + int version; + __u64 incarnation; + unsigned long flags; + int rc; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + incarnation = peer->ibp_incarnation; + version = (peer->ibp_version == 0) ? IBLND_MSG_VERSION : + peer->ibp_version; + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, version); + if (conn == NULL) { + kiblnd_peer_connect_failed(peer, 1, -ENOMEM); + kiblnd_peer_decref(peer); /* lose cmid's ref */ + return -ENOMEM; + } + + /* conn "owns" cmid now, so I return success from here on to ensure the + * CM callback doesn't destroy cmid. conn also takes over cmid's ref + * on peer */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version); + msg->ibm_u.connparams.ibcp_max_frags = IBLND_RDMA_FRAGS(version); + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + + kiblnd_pack_msg(peer->ibp_ni, msg, version, + 0, peer->ibp_nid, incarnation); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + + rc = rdma_connect(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + kib_peer_t *peer; + kib_conn_t *conn; + int rc; + + switch (event->event) { + default: + CERROR("Unexpected event: %d, status: %d\n", + event->event, event->status); + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ADDR ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer = (kib_peer_t *)cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status != 0) { + CNETERR("Can't resolve address for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) + return 0; + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer, 1, rc); + kiblnd_peer_decref(peer); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer = (kib_peer_t *)cmid->context; + CNETERR("%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CNETERR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, event->status); + kiblnd_peer_decref(peer); + return event->status; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CNETERR("%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + (void *)KIBLND_CONN_PARAM(event), + KIBLND_CONN_PARAM_LEN(event)); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); + return 0; + case RDMA_CM_EVENT_DISCONNECTED: + conn = (kib_conn_t *)cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + cmid->context = NULL; + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, + "Received notification of device removal\n" + "Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + + case RDMA_CM_EVENT_ADDR_CHANGE: + LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); + return 0; + } +} + +static int +kiblnd_check_txs_locked(kib_conn_t *conn, struct list_head *txs) +{ + kib_tx_t *tx; + struct list_head *ttmp; + + list_for_each (ttmp, txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT (tx->tx_queued); + } else { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } + + if (cfs_time_aftereq (jiffies, tx->tx_deadline)) { + CERROR("Timed out tx: %s, %lu seconds\n", + kiblnd_queue2str(conn, txs), + cfs_duration_sec(jiffies - tx->tx_deadline)); + return 1; + } + } + + return 0; +} + +static int +kiblnd_conn_timed_out_locked(kib_conn_t *conn) +{ + return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs_locked(conn, &conn->ibc_active_txs); +} + +void +kiblnd_check_conns (int idx) +{ + LIST_HEAD (closes); + LIST_HEAD (checksends); + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head *ctmp; + unsigned long flags; + + /* NB. We expect to have a look at all the peers and not find any + * RDMAs to time out, so we just use a shared lock while we + * take a look... */ + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + int timedout; + int sendnoop; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); + + sendnoop = kiblnd_need_noop(conn); + timedout = kiblnd_conn_timed_out_locked(conn); + if (!sendnoop && !timedout) { + spin_unlock(&conn->ibc_lock); + continue; + } + + if (timedout) { + CERROR("Timed out RDMA with %s (%lu): " + "c: %u, oc: %u, rc: %u\n", + libcfs_nid2str(peer->ibp_nid), + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive), + conn->ibc_credits, + conn->ibc_outstanding_credits, + conn->ibc_reserved_credits); + list_add(&conn->ibc_connd_list, &closes); + } else { + list_add(&conn->ibc_connd_list, + &checksends); + } + /* +ref for 'closes' or 'checksends' */ + kiblnd_conn_addref(conn); + + spin_unlock(&conn->ibc_lock); + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Handle timeout by closing the whole + * connection. We can only be sure RDMA activity + * has ceased once the QP has been modified. */ + while (!list_empty(&closes)) { + conn = list_entry(closes.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); + } + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + while (!list_empty(&checksends)) { + conn = list_entry(checksends.next, + kib_conn_t, ibc_connd_list); + list_del(&conn->ibc_connd_list); + kiblnd_check_sends(conn); + kiblnd_conn_decref(conn); + } +} + +void +kiblnd_disconnect_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (current == kiblnd_data.kib_connd); + LASSERT (conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +int +kiblnd_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_block_allsigs (); + + init_waitqueue_entry_current (&wait); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + + dropped_lock = 0; + + if (!list_empty (&kiblnd_data.kib_connd_zombies)) { + conn = list_entry(kiblnd_data. \ + kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (!list_empty(&kiblnd_data.kib_connd_conns)) { + conn = list_entry(kiblnd_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del(&conn->ibc_list); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, + flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += p * HZ; + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CNETERR("RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion(struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. */ + kib_conn_t *conn = (kib_conn_t *)arg; + struct kib_sched_info *sched = conn->ibc_sched; + unsigned long flags; + + LASSERT(cq == conn->ibc_cq); + + spin_lock_irqsave(&sched->ibs_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns); + + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + struct kib_sched_info *sched; + kib_conn_t *conn; + wait_queue_t wait; + unsigned long flags; + struct ib_wc wc; + int did_something; + int busy_loops = 0; + int rc; + + cfs_block_allsigs(); + + init_waitqueue_entry_current(&wait); + + sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)]; + + rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt); + if (rc != 0) { + CWARN("Failed to bind on CPT %d, please verify whether " + "all CPUs are healthy and reload modules if necessary, " + "otherwise your system might under risk of low " + "performance\n", sched->ibs_cpt); + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + did_something = 0; + + if (!list_empty(&sched->ibs_conns)) { + conn = list_entry(sched->ibs_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + if (rc < 0) { + CWARN("%s: ib_req_notify_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, + flags); + continue; + } + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + if (rc < 0) { + CWARN("%s: ib_poll_cq failed: %d, " + "closing connection\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc); + kiblnd_close_conn(conn, -EIO); + kiblnd_conn_decref(conn); + spin_lock_irqsave(&sched->ibs_lock, flags); + continue; + } + + spin_lock_irqsave(&sched->ibs_lock, flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... */ + /* +1 ref for sched_conns */ + kiblnd_conn_addref(conn); + list_add_tail(&conn->ibc_sched_list, + &sched->ibs_conns); + if (waitqueue_active(&sched->ibs_waitq)) + wake_up(&sched->ibs_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&sched->ibs_lock, flags); + kiblnd_complete(&wc); + + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&sched->ibs_waitq, &wait); + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + waitq_wait(&wait, TASK_INTERRUPTIBLE); + busy_loops = 0; + + remove_wait_queue(&sched->ibs_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&sched->ibs_lock, flags); + } + + spin_unlock_irqrestore(&sched->ibs_lock, flags); + + kiblnd_thread_fini(); + return 0; +} + +int +kiblnd_failover_thread(void *arg) +{ + rwlock_t *glock = &kiblnd_data.kib_global_lock; + kib_dev_t *dev; + wait_queue_t wait; + unsigned long flags; + int rc; + + LASSERT (*kiblnd_tunables.kib_dev_failover != 0); + + cfs_block_allsigs (); + + init_waitqueue_entry_current(&wait); + write_lock_irqsave(glock, flags); + + while (!kiblnd_data.kib_shutdown) { + int do_failover = 0; + int long_sleep; + + list_for_each_entry(dev, &kiblnd_data.kib_failed_devs, + ibd_fail_list) { + if (cfs_time_before(cfs_time_current(), + dev->ibd_next_failover)) + continue; + do_failover = 1; + break; + } + + if (do_failover) { + list_del_init(&dev->ibd_fail_list); + dev->ibd_failover = 1; + write_unlock_irqrestore(glock, flags); + + rc = kiblnd_dev_failover(dev); + + write_lock_irqsave(glock, flags); + + LASSERT (dev->ibd_failover); + dev->ibd_failover = 0; + if (rc >= 0) { /* Device is OK or failover succeed */ + dev->ibd_next_failover = cfs_time_shift(3); + continue; + } + + /* failed to failover, retry later */ + dev->ibd_next_failover = + cfs_time_shift(min(dev->ibd_failed_failover, 10)); + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + + continue; + } + + /* long sleep if no more pending failover */ + long_sleep = list_empty(&kiblnd_data.kib_failed_devs); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_unlock_irqrestore(glock, flags); + + rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : + cfs_time_seconds(1)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); + write_lock_irqsave(glock, flags); + + if (!long_sleep || rc != 0) + continue; + + /* have a long sleep, routine check all active devices, + * we need checking like this because if there is not active + * connection on the dev and no SEND from local, we may listen + * on wrong HCA for ever while there is a bonding failover */ + list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { + if (kiblnd_dev_can_failover(dev)) { + list_add_tail(&dev->ibd_fail_list, + &kiblnd_data.kib_failed_devs); + } + } + } + + write_unlock_irqrestore(glock, flags); + + kiblnd_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 000000000000..e21028b72302 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,493 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/o2iblnd/o2iblnd_modparams.c + * + * Author: Eric Barton + */ + +#include "o2iblnd.h" + +static int service = 987; +CFS_MODULE_PARM(service, "i", int, 0444, + "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +/* Number of threads in each scheduler pool which is percpt, + * we will estimate reasonable value based on CPUs if it's set to zero. */ +static int nscheds; +CFS_MODULE_PARM(nscheds, "i", int, 0444, + "number of threads in each scheduler pool"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int ntx = 512; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of message descriptors allocated for each pool"); + +/* NB: this value is shared by all CPTs */ +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int peer_credits_hiw = 0; +CFS_MODULE_PARM(peer_credits_hiw, "i", int, 0444, + "when eagerly to return credits"); + +static int peer_buffer_credits = 0; +CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444, + "# per-peer router buffer credits"); + +static int peer_timeout = 180; +CFS_MODULE_PARM(peer_timeout, "i", int, 0444, + "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +static char *ipif_name = "ib0"; +CFS_MODULE_PARM(ipif_name, "s", charp, 0444, + "IPoIB interface name"); + +static int retry_count = 5; +CFS_MODULE_PARM(retry_count, "i", int, 0644, + "Retransmissions when no ACK received"); + +static int rnr_retry_count = 6; +CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644, + "RNR retransmissions"); + +static int keepalive = 100; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "Idle time in seconds before sending a keepalive"); + +static int ib_mtu = 0; +CFS_MODULE_PARM(ib_mtu, "i", int, 0444, + "IB MTU 256/512/1024/2048/4096"); + +static int concurrent_sends = 0; +CFS_MODULE_PARM(concurrent_sends, "i", int, 0444, + "send work-queue sizing"); + +static int map_on_demand = 0; +CFS_MODULE_PARM(map_on_demand, "i", int, 0444, + "map on demand"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_pool_size = 512; +CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444, + "size of fmr pool on each CPT (>= ntx / 4)"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int fmr_flush_trigger = 384; +CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444, + "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +CFS_MODULE_PARM(fmr_cache, "i", int, 0444, + "non-zero to enable FMR caching"); + +/* NB: this value is shared by all CPTs, it can grow at runtime */ +static int pmr_pool_size = 512; +CFS_MODULE_PARM(pmr_pool_size, "i", int, 0444, + "size of MR cache pmr pool on each CPT"); + +/* + * 0: disable failover + * 1: enable failover if necessary + * 2: force to failover (for debug) + */ +static int dev_failover = 0; +CFS_MODULE_PARM(dev_failover, "i", int, 0444, + "HCA failover for bonding (0 off, 1 on, other values reserved)"); + + +static int require_privileged_port = 0; +CFS_MODULE_PARM(require_privileged_port, "i", int, 0644, + "require privileged port when accepting connection"); + +static int use_privileged_port = 1; +CFS_MODULE_PARM(use_privileged_port, "i", int, 0644, + "use privileged port when initiating connection"); + +kib_tunables_t kiblnd_tunables = { + .kib_dev_failover = &dev_failover, + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peertxcredits = &peer_credits, + .kib_peercredits_hiw = &peer_credits_hiw, + .kib_peerrtrcredits = &peer_buffer_credits, + .kib_peertimeout = &peer_timeout, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_concurrent_sends = &concurrent_sends, + .kib_ib_mtu = &ib_mtu, + .kib_map_on_demand = &map_on_demand, + .kib_fmr_pool_size = &fmr_pool_size, + .kib_fmr_flush_trigger = &fmr_flush_trigger, + .kib_fmr_cache = &fmr_cache, + .kib_pmr_pool_size = &pmr_pool_size, + .kib_require_priv_port = &require_privileged_port, + .kib_use_priv_port = &use_privileged_port, + .kib_nscheds = &nscheds +}; + +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + +static char ipif_basename_space[32]; + + +enum { + O2IBLND_SERVICE = 1, + O2IBLND_CKSUM, + O2IBLND_TIMEOUT, + O2IBLND_NTX, + O2IBLND_CREDITS, + O2IBLND_PEER_TXCREDITS, + O2IBLND_PEER_CREDITS_HIW, + O2IBLND_PEER_RTRCREDITS, + O2IBLND_PEER_TIMEOUT, + O2IBLND_IPIF_BASENAME, + O2IBLND_RETRY_COUNT, + O2IBLND_RNR_RETRY_COUNT, + O2IBLND_KEEPALIVE, + O2IBLND_CONCURRENT_SENDS, + O2IBLND_IB_MTU, + O2IBLND_MAP_ON_DEMAND, + O2IBLND_FMR_POOL_SIZE, + O2IBLND_FMR_FLUSH_TRIGGER, + O2IBLND_FMR_CACHE, + O2IBLND_PMR_POOL_SIZE, + O2IBLND_DEV_FAILOVER +}; + +static ctl_table_t kiblnd_ctl_table[] = { + { + .ctl_name = O2IBLND_SERVICE, + .procname = "service", + .data = &service, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_CKSUM, + .procname = "cksum", + .data = &cksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_TIMEOUT, + .procname = "timeout", + .data = &timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_NTX, + .procname = "ntx", + .data = &ntx, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_CREDITS, + .procname = "credits", + .data = &credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_PEER_TXCREDITS, + .procname = "peer_credits", + .data = &peer_credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_PEER_CREDITS_HIW, + .procname = "peer_credits_hiw", + .data = &peer_credits_hiw, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_PEER_RTRCREDITS, + .procname = "peer_buffer_credits", + .data = &peer_buffer_credits, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_PEER_TIMEOUT, + .procname = "peer_timeout", + .data = &peer_timeout, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_IPIF_BASENAME, + .procname = "ipif_name", + .data = ipif_basename_space, + .maxlen = sizeof(ipif_basename_space), + .mode = 0444, + .proc_handler = &proc_dostring + }, + { + .ctl_name = O2IBLND_RETRY_COUNT, + .procname = "retry_count", + .data = &retry_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_RNR_RETRY_COUNT, + .procname = "rnr_retry_count", + .data = &rnr_retry_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_KEEPALIVE, + .procname = "keepalive", + .data = &keepalive, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_CONCURRENT_SENDS, + .procname = "concurrent_sends", + .data = &concurrent_sends, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_IB_MTU, + .procname = "ib_mtu", + .data = &ib_mtu, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_MAP_ON_DEMAND, + .procname = "map_on_demand", + .data = &map_on_demand, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + + { + .ctl_name = O2IBLND_FMR_POOL_SIZE, + .procname = "fmr_pool_size", + .data = &fmr_pool_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_FMR_FLUSH_TRIGGER, + .procname = "fmr_flush_trigger", + .data = &fmr_flush_trigger, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_FMR_CACHE, + .procname = "fmr_cache", + .data = &fmr_cache, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_PMR_POOL_SIZE, + .procname = "pmr_pool_size", + .data = &pmr_pool_size, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = O2IBLND_DEV_FAILOVER, + .procname = "dev_failover", + .data = &dev_failover, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec + }, + {0} +}; + +static ctl_table_t kiblnd_top_ctl_table[] = { + { + .ctl_name = CTL_O2IBLND, + .procname = "o2iblnd", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = kiblnd_ctl_table + }, + {0} +}; + +void +kiblnd_initstrtunable(char *space, char *str, int size) +{ + strncpy(space, str, size); + space[size-1] = 0; +} + +void +kiblnd_sysctl_init (void) +{ + kiblnd_initstrtunable(ipif_basename_space, ipif_name, + sizeof(ipif_basename_space)); + + kiblnd_tunables.kib_sysctl = + cfs_register_sysctl_table(kiblnd_top_ctl_table, 0); + + if (kiblnd_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); +} + +void +kiblnd_sysctl_fini (void) +{ + if (kiblnd_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kiblnd_tunables.kib_sysctl); +} + +#else + +void +kiblnd_sysctl_init (void) +{ +} + +void +kiblnd_sysctl_fini (void) +{ +} + +#endif + +int +kiblnd_tunables_init (void) +{ + if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { + CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", + *kiblnd_tunables.kib_ib_mtu); + return -EINVAL; + } + + if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT; + + if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX) + *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX; + + if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits) + *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits; + + if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits) + *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1; + + if (*kiblnd_tunables.kib_map_on_demand < 0 || + *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS) + *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */ + + if (*kiblnd_tunables.kib_map_on_demand == 1) + *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */ + + if (*kiblnd_tunables.kib_concurrent_sends == 0) { + if (*kiblnd_tunables.kib_map_on_demand > 0 && + *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2; + else + *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits); + } + + if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2) + *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2; + + if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) { + CWARN("Concurrent sends %d is lower than message queue size: %d, " + "performance may drop slightly.\n", + *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits); + } + + kiblnd_sysctl_init(); + return 0; +} + +void +kiblnd_tunables_fini (void) +{ + kiblnd_sysctl_fini(); +} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile new file mode 100644 index 000000000000..6494b2bada05 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o + + + +ccflags-y := -I$(src)/../../include diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 000000000000..c826bf9d49ac --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2902 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include "socklnd.h" + +lnd_t the_ksocklnd; +ksock_nal_data_t ksocknal_data; + +ksock_interface_t * +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) +{ + ksock_net_t *net = ni->ni_data; + int i; + ksock_interface_t *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return (iface); + } + + return (NULL); +} + +ksock_route_t * +ksocknal_create_route (__u32 ipaddr, int port) +{ + ksock_route_t *route; + + LIBCFS_ALLOC (route, sizeof (*route)); + if (route == NULL) + return (NULL); + + atomic_set (&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return (route); +} + +void +ksocknal_destroy_route (ksock_route_t *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE (route, sizeof (*route)); +} + +int +ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; + + LASSERT (id.nid != LNET_NID_ANY); + LASSERT (id.pid != LNET_PID_ANY); + LASSERT (!in_interrupt()); + + LIBCFS_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) + return -ENOMEM; + + memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ + + peer->ksnp_ni = ni; + peer->ksnp_id = id; + atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + peer->ksnp_accepting = 0; + peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD (&peer->ksnp_conns); + INIT_LIST_HEAD (&peer->ksnp_routes); + INIT_LIST_HEAD (&peer->ksnp_tx_queue); + INIT_LIST_HEAD (&peer->ksnp_zc_req_list); + spin_lock_init(&peer->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer; + return 0; +} + +void +ksocknal_destroy_peer (ksock_peer_t *peer) +{ + ksock_net_t *net = peer->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); + + LASSERT (atomic_read (&peer->ksnp_refcount) == 0); + LASSERT (peer->ksnp_accepting == 0); + LASSERT (list_empty (&peer->ksnp_conns)); + LASSERT (list_empty (&peer->ksnp_routes)); + LASSERT (list_empty (&peer->ksnp_tx_queue)); + LASSERT (list_empty (&peer->ksnp_zc_req_list)); + + LIBCFS_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. */ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +ksock_peer_t * +ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, ksock_peer_t, ksnp_list); + + LASSERT (!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + atomic_read(&peer->ksnp_refcount)); + return (peer); + } + return (NULL); +} + +ksock_peer_t * +ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (peer); +} + +void +ksocknal_unlink_peer_locked (ksock_peer_t *peer) +{ + int i; + __u32 ip; + ksock_interface_t *iface; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) { + LASSERT (i < LNET_MAX_INTERFACES); + ip = peer->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + /* All IPs in peer->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. */ + LASSERT (iface != NULL); + + CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", + peer, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT (list_empty(&peer->ksnp_conns)); + LASSERT (list_empty(&peer->ksnp_routes)); + LASSERT (!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del (&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer); +} + +int +ksocknal_get_peer_info (lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_n_passive_ips == 0 && + list_empty(&peer->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = peer->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each (rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *id = peer->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } + out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return (rc); +} + +void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + } else { + CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from " + "%u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(route->ksnr_myipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1<ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +void +ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_route_t *route2; + + LASSERT (!peer->ksnp_closing); + LASSERT (route->ksnr_peer == NULL); + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT (route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { + CERROR ("Duplicate route %s %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr)); + LBUG(); + } + } + + route->ksnr_peer = peer; + ksocknal_peer_addref(peer); + /* peer's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +void +ksocknal_del_route_locked (ksock_route_t *route) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT (!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked (conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del (&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer's ref */ + + if (list_empty (&peer->ksnp_routes) && + list_empty (&peer->ksnp_conns)) { + /* I've just removed the last route to a peer with no active + * connections */ + ksocknal_unlink_peer_locked (peer); + } +} + +int +ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) +{ + struct list_head *tmp; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return (-EINVAL); + + /* Have a brand new peer ready... */ + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route (ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer); + return (-ENOMEM); + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked (ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer); + peer = peer2; + } else { + /* peer table takes my ref on peer */ + list_add_tail (&peer->ksnp_list, + ksocknal_nid2peerlist (id.nid)); + } + + route2 = NULL; + list_for_each (tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (0); +} + +void +ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip) +{ + ksock_conn_t *conn; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT (!peer->ksnp_closing); + + /* Extra ref prevents peer disappearing until I'm done with it */ + ksocknal_peer_addref(peer); + + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked (route); + } + + nshared = 0; + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked (route); + } + + list_for_each_safe (tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer); + /* NB peer unlinks itself when last conn/route is removed */ +} + +int +ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) +{ + LIST_HEAD (zombies); + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer); /* a ref for me... */ + + ksocknal_del_peer_locked (peer, ip); + + if (peer->ksnp_closing && + !list_empty(&peer->ksnp_tx_queue)) { + LASSERT (list_empty(&peer->ksnp_conns)); + LASSERT (list_empty(&peer->ksnp_routes)); + + list_splice_init(&peer->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); + + return (rc); +} + +ksock_conn_t * +ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + LASSERT (!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + list_for_each (ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry (ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data. \ + ksnd_global_lock); + return (conn); + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return (NULL); +} + +ksock_sched_t * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; + + LASSERT(info->ksi_nthreads > 0); + + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + + return sched; +} + +int +ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs) +{ + ksock_net_t *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT (nip <= LNET_MAX_INTERFACES); + + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT (ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return (nip); +} + +int +ksocknal_match_peerip (ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = (ips[i] ^ iface->ksni_ipaddr); + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT (best >= 0); + return (best); +} + +int +ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT (n_peerips <= LNET_MAX_INTERFACES); + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + MIN(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer IPs that match old interfaces, then choose new + * interfaces to match the remaining peer IPS. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + + /* peer passive ips are kept up to date */ + LASSERT(best_iface != NULL); + } else { + /* choose a new interface */ + LASSERT (i == peer->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer->ksnp_n_passive_ips; k++) + if (peer->ksnp_passive_ips[k] == ip) + break; + + if (k < peer->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = (ip ^ peerips[k]); + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer->ksnp_passive_ips[i] = ip; + peer->ksnp_n_passive_ips = i+1; + } + + LASSERT (best_iface != NULL); + + /* mark the best matching peer IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer IP addresses */ + memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return (n_ips); +} + +void +ksocknal_create_routes(ksock_peer_t *peer, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept (lnet_ni_t *ni, socket_t *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from " + "%u.%u.%u.%u: memory exhausted\n", + HIPQUAD(peer_ip)); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +int +ksocknal_connecting (ksock_peer_t *peer, __u32 ipaddr) +{ + ksock_route_t *route; + + list_for_each_entry (route, &peer->ksnp_routes, ksnr_list) { + + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, + socket_t *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD (zombies); + lnet_process_id_t peerid; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_t *peer = NULL; + ksock_peer_t *peer2; + ksock_sched_t *sched; + ksock_hello_msg_t *hello; + int cpt; + ksock_tx_t *tx; + ksock_tx_t *txtmp; + int rc; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT (active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + memset (conn, 0, sizeof (*conn)); + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set (&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set (&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs (conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + peer = route->ksnr_peer; + LASSERT(ni == peer->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello (ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT (rc == 0 || active); + LASSERT (conn->ksnc_proto != NULL); + LASSERT (peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid); + + if (active) { + ksocknal_peer_addref(peer); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer in the peer + * table (which takes my ref) */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); + peer = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... */ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ + rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + + if (peer->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer + * wants a different protocol than the one I asked for. + */ + LASSERT (list_empty(&peer->ksnp_conns)); + + peer->ksnp_proto = conn->ksnc_proto; + peer->ksnp_incarnation = incarnation; + } + + if (peer->ksnp_proto != conn->ksnc_proto || + peer->ksnp_incarnation != incarnation) { + /* Peer rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer, 0, 0); + + peer->ksnp_proto = NULL; + rc = ESTALE; + warn = peer->ksnp_incarnation != incarnation ? + "peer rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer + * realises we're connected. */ + LASSERT (rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_ipaddr)); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer; /* conn takes my ref on peer */ + peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_send_keepalive = 0; + peer->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = cfs_time_current(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = cfs_sock_wmem_queued(sock); + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with adding to peer's conn list */ + + list_add (&conn->ksnc_list, &peer->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. */ + list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO) + continue; + + list_del (&tx->tx_list); + ksocknal_queue_tx_locked (tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d" + " incarnation:"LPD64" sched[%d:%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. */ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + + failed_2: + if (!peer->ksnp_closing && + list_empty (&peer->ksnp_conns) && + list_empty (&peer->ksnp_routes)) { + list_add(&zombies, &peer->ksnp_tx_queue); + list_del_init(&peer->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); + + failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + LIBCFS_FREE (conn, sizeof(*conn)); + + failed_0: + libcfs_sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and queues the + * connection for the reaper to terminate. + * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; + + LASSERT (peer->ksnp_error == 0); + LASSERT (!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del (&conn->ksnc_list); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... */ + LASSERT (!route->ksnr_deleted); + LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); + + conn2 = NULL; + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + + conn->ksnc_route = NULL; + +#if 0 /* irrelevent with only eager routes */ + /* make route least favourite */ + list_del (&route->ksnr_list); + list_add_tail (&route->ksnr_list, &peer->ksnp_routes); +#endif + ksocknal_route_decref(route); /* drop conn's ref on route */ + } + + if (list_empty (&peer->ksnp_conns)) { + /* No more connections to this peer */ + + if (!list_empty(&peer->ksnp_tx_queue)) { + ksock_tx_t *tx; + + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be send to /dev/null by scheduler */ + list_for_each_entry(tx, &peer->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + peer->ksnp_proto = NULL; /* renegotiate protocol version */ + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * peer with no routes to it */ + ksocknal_unlink_peer_locked (peer); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed (ksock_peer_t *peer) +{ + int notify = 0; + cfs_time_t last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existance. */ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = peer->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + LIST_HEAD (zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT (conn->ksnc_sock == NULL); + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn (ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. + * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)){ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT (list_empty (&peer->ksnp_conns)); + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn (ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn (ksock_conn_t *conn) +{ + cfs_time_t last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0); + LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0); + LASSERT (conn->ksnc_sock == NULL); + LASSERT (conn->ksnc_route == NULL); + LASSERT (!conn->ksnc_tx_scheduled); + LASSERT (!conn->ksnc_rx_scheduled); + LASSERT (list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d]" + ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, " + "last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize (conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s" + ", ip %d.%d.%d.%d:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + break; + default: + LBUG (); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE (conn, sizeof (*conn)); +} + +int +ksocknal_close_peer_conns_locked (ksock_peer_t *peer, __u32 ipaddr, int why) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked (conn, why); + } + } + + return (count); +} + +int +ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) +{ + ksock_peer_t *peer = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked (peer, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (count); +} + +int +ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) + return (0); + + return (count == 0 ? -ENOENT : 0); +} + +void +ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + lnet_process_id_t id = {0}; + + id.nid = gw_nid; + id.pid = LNET_PID_ANY; + + CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns (id, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ +} + +void +ksocknal_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when) +{ + int connect = 1; + cfs_time_t last_alive = 0; + cfs_time_t now = cfs_time_current(); + ksock_peer_t *peer = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; + + list_for_each (tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = cfs_sock_wmem_queued(conn->ksnc_sock); + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = now; + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = last_alive; + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + write_unlock_bh(glock); + return; +} + +void +ksocknal_push_peer (ksock_peer_t *peer) +{ + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each (tmp, &peer->ksnp_conns) { + if (i++ == index) { + conn = list_entry (tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn (conn); + ksocknal_conn_decref(conn); + } +} + +int +ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + index = 0; + peer = NULL; + + list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + + if (index++ == j) { + ksocknal_peer_addref(peer); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (peer != NULL) { + rc = 0; + ksocknal_push_peer (peer); + ksocknal_peer_decref(peer); + } + } + + } + + return (rc); +} + +int +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) +{ + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_t *peer; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return (-EINVAL); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, + ksnp_list); + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) + if (peer->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, + ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +void +ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) + if (peer->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer->ksnp_n_passive_ips; j++) + peer->ksnp_passive_ips[j-1] = + peer->ksnp_passive_ips[j]; + peer->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked (conn, 0); + } +} + +int +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) +{ + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_t *peer; + __u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +int +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + lnet_process_id_t id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LUSTRE_SRV_LNET_PID; + return ksocknal_add_peer (ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer (ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns (id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +void +ksocknal_free_buffers (void) +{ + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } + + LIBCFS_FREE (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +void +ksocknal_base_shutdown(void) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT (ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); + } + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs)); + LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + + sched = &info->ksi_scheds[j]; + LASSERT(list_empty(&sched->\ + kss_tx_conns)); + LASSERT(list_empty(&sched->\ + kss_rx_conns)); + LASSERT(list_empty(&sched-> \ + kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; + wake_up_all(&sched->kss_waitq); + } + } + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + cfs_pause(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +__u64 +ksocknal_new_incarnation (void) +{ + struct timeval tv; + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. Hopefully + * we won't be able to reboot more frequently than 1MHz for the + * forseeable future :) */ + + do_gettimeofday(&tv); + + return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; +} + +int +ksocknal_base_startup(void) +{ + struct ksock_sched_info *info; + int rc; + int i; + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (ksocknal_data.ksnd_nnets == 0); + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC (ksocknal_data.ksnd_peers, + sizeof (struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) + goto failed; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = cfs_time_current_sec(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((ulong_ptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR ("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +void +ksocknal_debug_peerhash (lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, " + "closing %d, accepting %d, err %d, zcookie "LPU64", " + "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id), + atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !list_empty(&peer->ksnp_tx_queue), + !list_empty(&peer->ksnp_zc_req_list)); + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " + "del %d\n", atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each (tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown (lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {0}; + + anyid.nid = LNET_NID_ANY; + anyid.pid = LNET_PID_ANY; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + cfs_pause(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + strncpy(&net->ksnn_interfaces[j].ksni_name[0], + names[i], IFNAMSIZ); + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +int +ksocknal_search_new_ipif(ksock_net_t *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = &tmp->ksnn_interfaces[j].\ + ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +int +ksocknal_start_schedulers(struct ksock_sched_info *info) +{ + int nthrs; + int rc = 0; + int i; + + if (info->ksi_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = info->ksi_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + info->ksi_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, info->ksi_nthreads_max); + } else { + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + info->ksi_cpt, info->ksi_nthreads + i, rc); + break; + } + + info->ksi_nthreads += i; + return rc; +} + +int +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); + + for (i = 0; i < ncpts; i++) { + struct ksock_sched_info *info; + int cpt = (cpts == NULL) ? i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + info = ksocknal_data.ksnd_sched_info[cpt]; + + if (!newif && info->ksi_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(info); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup (lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT (ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + + strncpy(&net->ksnn_interfaces[i].ksni_name[0], + ni->ni_interfaces[i], IFNAMSIZ); + } + net->ksnn_ninterfaces = i; + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +void __exit +ksocknal_module_fini (void) +{ + lnet_unregister_lnd(&the_ksocklnd); + ksocknal_tunables_fini(); +} + +int __init +ksocknal_module_init (void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT (SOCKLND_CONN_NTYPES <= 4); + CLASSERT (SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0"); +MODULE_LICENSE("GPL"); + +cfs_module(ksocknal, "3.0.0", ksocknal_module_init, ksocknal_module_fini); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 000000000000..b483e0c3a69a --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include "socklnd_lib-linux.h" + +#include +#include +#include +#include +#include + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. */ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + /* conn waiting to be written */ + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + /* # connections assigned to this scheduler */ + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; + struct iovec kss_scratch_iov[LNET_MAX_IOV]; +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +typedef struct /* in-use interface */ +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +} ksock_interface_t; + +typedef struct +{ + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... */ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */ + int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + ctl_table_header_t *ksnd_sysctl; /* sysctl interface */ +#endif +} ksock_tunables_t; + +typedef struct +{ + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? */ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES]; +} ksock_net_t; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +typedef struct +{ + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched_info **ksnd_sched_info; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/ + struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */ + struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + cfs_time_t ksnd_reaper_waketime;/* when reaper will wake */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_connd_connreqs; /* incoming connection requests */ + struct list_head ksnd_connd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ + int ksnd_connd_connecting;/* # connds connecting */ + /** time stamp of the last failed connecting attempt */ + long ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + long ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + spinlock_t ksnd_connd_lock; /* serialise */ + + struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */ + spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */ + +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet iovec frags */ + struct iovec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ + ksock_msg_t tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct iovec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct iovec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +} ksock_tx_t; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +typedef union { + struct iovec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +typedef struct ksock_conn +{ + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ + socket_t *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* reader */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct iovec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + ksock_msg_t ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * lnet_hdr_t, it's stored in + * ksnc_msg.ksm_u.lnetmsg */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */ + cfs_time_t ksnc_tx_deadline; /* when (in jiffies) tx times out */ + int ksnc_tx_bufnob; /* send buffer marker */ + atomic_t ksnc_tx_nob; /* # bytes queued */ + int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + cfs_time_t ksnc_tx_last_post; /* time stamp of the last posted TX */ +} ksock_conn_t; + +typedef struct ksock_route +{ + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */ + cfs_duration_t ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +} ksock_route_t; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +typedef struct ksock_peer +{ + struct list_head ksnp_list; /* stash on global peer list */ + cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ + lnet_process_id_t ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ + cfs_time_t ksnp_send_keepalive; /* time to send keepalive */ + lnet_ni_t *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... */ + __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_t; + +typedef struct ksock_connreq +{ + struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ + lnet_ni_t *ksncr_ni; /* chosen NI */ + socket_t *ksncr_sock; /* accepted socket */ +} ksock_connreq_t; + +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +typedef struct ksock_proto +{ + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(ksock_msg_t *); /* message unpack */ + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +} ksock_proto_t; + +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return (&ksocknal_data.ksnd_peers [hash]); +} + +static inline void +ksocknal_conn_addref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref (ksock_conn_t *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return (rc); +} + +static inline void +ksocknal_connsock_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + libcfs_sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } +} + +static inline void +ksocknal_tx_addref (ksock_tx_t *tx) +{ + LASSERT (atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx); + +static inline void +ksocknal_tx_decref (ksock_tx_t *tx) +{ + LASSERT (atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx); +} + +static inline void +ksocknal_route_addref (ksock_route_t *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route (ksock_route_t *route); + +static inline void +ksocknal_route_decref (ksock_route_t *route) +{ + LASSERT (atomic_read (&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route (route); +} + +static inline void +ksocknal_peer_addref (ksock_peer_t *peer) +{ + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + atomic_inc(&peer->ksnp_refcount); +} + +extern void ksocknal_destroy_peer (ksock_peer_t *peer); + +static inline void +ksocknal_peer_decref (ksock_peer_t *peer) +{ + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer->ksnp_refcount)) + ksocknal_destroy_peer (peer); +} + +int ksocknal_startup (lnet_ni_t *ni); +void ksocknal_shutdown (lnet_ni_t *ni); +int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(lnet_ni_t *ni, socket_t *sock); + +extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); +extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id); +extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id); +extern void ksocknal_peer_failed (ksock_peer_t *peer); +extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, + socket_t *sock, int type); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn (ksock_conn_t *conn); +extern void ksocknal_destroy_conn (ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked (ksock_peer_t *peer, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); +extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer, + ksock_tx_t *tx, int nonblk); + +extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx, + lnet_process_id_t id); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx (ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini (void); +extern void ksocknal_launch_all_connections_locked (ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connectable_route_locked (ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_connd (void *arg); +extern int ksocknal_reaper (void *arg); +extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello); +extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(socket_t *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(socket_t *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(socket_t *sock, ksock_conn_t *conn); +extern void ksocknal_lib_push_conn (ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn); +extern int ksocknal_lib_setup_sock (socket_t *so); +extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack (ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov (ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); +extern void ksocknal_tunables_fini(void); +extern int ksocknal_lib_tunables_init(void); +extern void ksocknal_lib_tunables_fini(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); + +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 000000000000..ad5e24104238 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2664 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +ksock_tx_t * +ksocknal_alloc_tx(int type, int size) +{ + ksock_tx_t *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. \ + next, ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +ksock_tx_t * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + ksock_tx_t *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +int +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct iovec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT (tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return (rc); + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base = (void *)((char *)iov->iov_base + nob); + iov->iov_len -= nob; + return (rc); + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return (rc); +} + +int +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return (rc); + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return (rc); +} + +int +ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT (tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } + + bufnob = cfs_sock_wmem_queued(conn->ksnc_sock); + if (rc > 0) /* sent something? */ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if(rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return (rc); +} + +int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct iovec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT (conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); + + if (rc <= 0) + return (rc); + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base = (void *)((char *)iov->iov_base + nob); + return (-EAGAIN); + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return (rc); +} + +int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); + + if (rc <= 0) + return (rc); + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +int +ksocknal_receive (ksock_conn_t *conn) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. */ + int rc; + ENTRY; + + if (ksocknal_data.ksnd_stall_rx != 0) { + cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return (-ESHUTDOWN); + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + RETURN (rc); +} + +void +ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) +{ + lnet_msg_t *lnetmsg = tx->tx_lnetmsg; + int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO; + ENTRY; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + if (ni == NULL && tx->tx_conn != NULL) + ni = tx->tx_conn->ksnc_peer->ksnp_ni; + + ksocknal_free_tx (tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize (ni, lnetmsg, rc); + + EXIT; +} + +void +ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error) +{ + ksock_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del (&tx->tx_list); + + LASSERT (atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done (ni, tx); + } +} + +static void +ksocknal_check_zc_req(ksock_tx_t *tx) +{ + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_t *peer = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; + + if (peer->ksnp_zc_next_cookie == 0) + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); + + spin_unlock(&peer->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(ksock_tx_t *tx) +{ + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +int +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit (conn, tx); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); + + return (0); + } + + if (rc == -EAGAIN) + return (rc); + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + return (rc); + } + + /* Actual error */ + LASSERT (rc < 0); + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %u.%u.%u.%u reset our connection " + "while we were sending data; it may have " + "rebooted.\n", + HIPQUAD(conn->ksnc_ipaddr)); + break; + default: + LCONSOLE_WARN("There was an unexpected network error " + "while writing to %u.%u.%u.%u: %d.\n", + HIPQUAD(conn->ksnc_ipaddr), rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s" + " ip %d.%d.%d.%d:%d\n", conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + + return (rc); +} + +void +ksocknal_launch_connection_locked (ksock_route_t *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked (ksock_peer_t *peer) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + cfs_sock_wmem_queued(c->ksnc_sock); + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = cfs_time_current(); + + return conn; +} + +void +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +{ + ksock_sched_t *sched = conn->ksnc_scheduler; + ksock_msg_t *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. */ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == + (unsigned int)tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + + /* + * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ + * but they're used inside spinlocks a lot. + */ + bufnob = cfs_sock_wmem_queued(conn->ksnc_sock); + spin_lock_bh(&sched->kss_lock); + + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { + /* First packet starts the timeout */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = 0; + mb(); /* order with adding to tx_queue */ + } + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT (msg->ksm_zc_cookies[1] != 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) + ztx = tx; /* ZC ACK piggybacked on ztx release tx later */ + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? */ + LASSERT (msg->ksm_zc_cookies[1] == 0); + LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL); + + ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); + /* ztx will be released later */ + } + + if (ztx != NULL) { + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + } + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + /* +1 ref for scheduler */ + ksocknal_conn_addref(conn); + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + + +ksock_route_t * +ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +{ + cfs_time_t now = cfs_time_current(); + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) /* connections being established */ + continue; + + /* all route types connected ? */ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %u.%u.%u.%u " + "(cnted %d, interval %ld, %ld secs later)\n", + HIPQUAD(route->ksnr_ipaddr), + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); + continue; + } + + return (route); + } + + return (NULL); +} + +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return (route); + } + + return (NULL); +} + +int +ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) +{ + ksock_peer_t *peer; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return (0); + } + } + } + + /* I'll need a write lock... */ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to " + "userspace process %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer); + + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return (0); + } + + if (peer->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer) != NULL) { + /* the message is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + /* Queue the message until a connection is established */ + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + return (-EHOSTUNREACH); +} + +int +ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + int mpflag = 0; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... */ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return (-ENOMEM); + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + if (rc == 0) + return (0); + + ksocknal_free_tx(tx); + return (-EIO); +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + task_t *task = kthread_run(fn, arg, name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... */ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare lnet_hdr_t */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); + conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +int +ksocknal_process_receive (ksock_conn_t *conn) +{ + lnet_hdr_t *lhdr; + lnet_process_id_t *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatability */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); + + if (rc <= 0) { + LASSERT (rc != -EAGAIN); + + if (rc == 0) + CDEBUG (D_NET, "[%p] EOF from %s" + " ip %d.%d.%d.%d:%d\n", conn, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR ("[%p] Error %d on read from %s" + " ip %d.%d.%d.%d:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? -ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EPROTO); + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: "LPU64", "LPU64"\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (rc); + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); + conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return (-EPROTO); + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return (-EINVAL); /* keep gcc happy */ +} + +int +ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up (&sched->kss_waitq); + LASSERT (conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = (!ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns)); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; + + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + if (rc != 0) { + CERROR("Can't set CPT affinity to %d: %d\n", + info->ksi_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail (&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty (&sched->kss_tx_conns)) { + LIST_HEAD (zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty (&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + cfs_wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched), rc); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + ENTRY; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && // not being progressed + !list_empty(&conn->ksnc_tx_queue)){//packets to send + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + EXIT; +} + +ksock_proto_t * +ksocknal_parse_proto_version (ksock_hello_msg_t *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; + + CLASSERT (sizeof (lnet_magicversion_t) == + offsetof (ksock_hello_msg_t, kshm_src_nid)); + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + + LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT (conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +int +ksocknal_invert_type(int type) +{ + switch (type) + { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return (type); + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + return (SOCKLND_CONN_NONE); + } +} + +int +ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *peerid, + __u64 *incarnation) +{ + /* Return < 0 fatal error + * 0 success + * EALREADY lost connection race + * EPROTO protocol version mismatch + */ + socket_t *sock = conn->ksnc_sock; + int active = (conn->ksnc_proto != NULL); + int timeout; + int proto_match; + int rc; + ksock_proto_t *proto; + lnet_process_id_t recv_id; + + /* socket type set on active connections - not set on passive */ + LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); + + timeout = active ? *ksocknal_tunables.ksnd_timeout : + lnet_acceptor_timeout(); + + rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout); + if (rc != 0) { + CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &hello->kshm_version, + sizeof(hello->kshm_version), timeout); + if (rc != 0) { + CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0); + return rc; + } + + proto = ksocknal_parse_proto_version(hello); + if (proto == NULL) { + if (!active) { + /* unknown protocol from peer, tell peer my protocol */ + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, ni->ni_nid, hello); + } + + CERROR ("Unknown protocol version (%d.x expected)" + " from %u.%u.%u.%u\n", + conn->ksnc_proto->pro_version, + HIPQUAD(conn->ksnc_ipaddr)); + + return -EPROTO; + } + + proto_match = (conn->ksnc_proto == proto); + conn->ksnc_proto = proto; + + /* receive the rest of hello message anyway */ + rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout); + if (rc != 0) { + CERROR("Error %d reading or checking hello from from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0); + return rc; + } + + *incarnation = hello->kshm_src_incarnation; + + if (hello->kshm_src_nid == LNET_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY" + "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + if (!active && + conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + /* Userspace NAL assigns peer process ID from socket */ + recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG; + recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr); + } else { + recv_id.nid = hello->kshm_src_nid; + recv_id.pid = hello->kshm_src_pid; + } + + if (!active) { + *peerid = recv_id; + + /* peer determines type */ + conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype); + if (conn->ksnc_type == SOCKLND_CONN_NONE) { + CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n", + hello->kshm_ctype, libcfs_id2str(*peerid), + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + return 0; + } + + if (peerid->pid != recv_id.pid || + peerid->nid != recv_id.nid) { + LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host" + " %u.%u.%u.%u, but they claimed they were " + "%s; please check your Lustre " + "configuration.\n", + libcfs_id2str(*peerid), + HIPQUAD(conn->ksnc_ipaddr), + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + HIPQUAD(conn->ksnc_ipaddr), + hello->kshm_ctype); + return -EPROTO; + } + + return 0; +} + +int +ksocknal_connect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_peer_t *peer = route->ksnr_peer; + int type; + int wanted; + socket_t *sock; + cfs_time_t deadline; + int retry_later = 0; + int rc = 0; + + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer/route got closed under me, or + * route got connected while queued */ + if (peer->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer is connecting to me */ + if (peer->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (cfs_time_aftereq(cfs_time_current(), deadline)) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer %s: conn race, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer's incoming connection request */ + + if (rc == EALREADY || + (rc == 0 && peer->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + + ksocknal_launch_connection_locked(route); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return retry_later; + + failed: + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + /* This is a retry rather than a new connection */ + route->ksnr_retry_interval *= 2; + route->ksnr_retry_interval = + MAX(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + route->ksnr_retry_interval = + MIN(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + + LASSERT (route->ksnr_retry_interval != 0); + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + + if (!list_empty(&peer->ksnp_tx_queue) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + ksock_conn_t *conn; + + /* ksnp_tx_queue is queued on a conn on successful + * connection for V1.x and V2.x */ + if (!list_empty (&peer->ksnp_conns)) { + conn = list_entry(peer->ksnp_conns.next, + ksock_conn_t, ksnc_list); + LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x); + } + + /* take all the blocked packets while I've got the lock and + * complete below... */ + list_splice_init(&peer->ksnp_tx_queue, &zombies); + } + +#if 0 /* irrelevent with only eager routes */ + if (!route->ksnr_deleted) { + /* make this route least-favourite for re-selection */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + } +#endif + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_peer_failed(peer); + ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1); + return 0; +} + +/* + * check whether we need to create more connds. + * It will try to create new thread if it's necessary, @timeout can + * be updated if failed to create, so caller wouldn't keep try while + * running out of resource. + */ +static int +ksocknal_connd_check_start(long sec, long *timeout) +{ + char name[16]; + int rc; + int total = ksocknal_data.ksnd_connd_starting + + ksocknal_data.ksnd_connd_running; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (total >= *ksocknal_tunables.ksnd_nconnds_max || + total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) { + /* can't create more connd, or still have enough + * threads to handle more connecting */ + return 0; + } + + if (list_empty(&ksocknal_data.ksnd_connd_routes)) { + /* no pending connecting request */ + return 0; + } + + if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) { + /* may run out of resource, retry later */ + *timeout = cfs_time_seconds(1); + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* serialize starting to avoid flood */ + return 0; + } + + ksocknal_data.ksnd_connd_starting_stamp = sec; + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + /* NB: total is the next id */ + snprintf(name, sizeof(name), "socknal_cd%02d", total); + rc = ksocknal_thread_start(ksocknal_connd, NULL, name); + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + if (rc == 0) + return 1; + + /* we tried ... */ + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_failed_stamp = cfs_time_current_sec(); + + return 1; +} + +/* + * check whether current thread can exit, it will return 1 if there are too + * many threads and no creating in past 120 seconds. + * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(long sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + ksock_route_t *route; + cfs_time_t now; + + now = cfs_time_current(); + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd (void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; + wait_queue_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs (); + + init_waitqueue_entry_current (&wait); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + ksock_route_t *route = NULL; + long sec = cfs_time_current_sec(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. \ + next, ksock_connreq_t, ksncr_list); + + list_del(&cr->ksncr_list); + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + ksocknal_create_conn(cr->ksncr_ni, NULL, + cr->ksncr_sock, SOCKLND_CONN_NONE); + lnet_ni_decref(cr->ksncr_ni); + LIBCFS_FREE(cr, sizeof(*cr)); + + spin_lock_bh(connd_lock); + } + + /* Only handle an outgoing connection request if there + * is a thread left to handle incoming connections and + * create new connd */ + if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < + ksocknal_data.ksnd_connd_running) { + route = ksocknal_connd_get_route_locked(&timeout); + } + if (route != NULL) { + list_del (&route->ksnr_connd_list); + ksocknal_data.ksnd_connd_connecting++; + spin_unlock_bh(connd_lock); + dropped_lock = 1; + + if (ksocknal_connect(route)) { + /* consecutive retry */ + if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { + CWARN("massive consecutive " + "re-connecting to %u.%u.%u.%u\n", + HIPQUAD(route->ksnr_ipaddr)); + cons_retry = 0; + } + } else { + cons_retry = 0; + } + + ksocknal_route_decref(route); + + spin_lock_bh(connd_lock); + ksocknal_data.ksnd_connd_connecting--; + } + + if (dropped_lock) { + if (++nloops < SOCKNAL_RESCHED) + continue; + spin_unlock_bh(connd_lock); + nloops = 0; + cond_resched(); + spin_lock_bh(connd_lock); + continue; + } + + /* Nothing to do for 'timeout' */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_unlock_bh(connd_lock); + + nloops = 0; + waitq_timedwait(&wait, TASK_INTERRUPTIBLE, timeout); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); + spin_lock_bh(connd_lock); + } + ksocknal_data.ksnd_connd_running--; + spin_unlock_bh(connd_lock); + + ksocknal_thread_fini(); + return 0; +} + +ksock_conn_t * +ksocknal_find_timed_out_conn (ksock_peer_t *peer) +{ + /* We're called with a shared lock on ksnd_global_lock */ + ksock_conn_t *conn; + struct list_head *ctmp; + + list_for_each (ctmp, &peer->ksnp_conns) { + int error; + conn = list_entry (ctmp, ksock_conn_t, ksnc_list); + + /* Don't need the {get,put}connsock dance to deref ksnc_sock */ + LASSERT (!conn->ksnc_closing); + + /* SOCK_ERROR will reset error code of socket in + * some platform (like Darwin8.x) */ + error = cfs_sock_error(conn->ksnc_sock); + if (error != 0) { + ksocknal_conn_addref(conn); + + switch (error) { + case ECONNRESET: + CNETERR("A connection with %s " + "(%u.%u.%u.%u:%d) was reset; " + "it may have rebooted.\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + break; + case ETIMEDOUT: + CNETERR("A connection with %s " + "(%u.%u.%u.%u:%d) timed out; the " + "network or node may be down.\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + break; + default: + CNETERR("An unexpected network error %d " + "occurred with %s " + "(%u.%u.%u.%u:%d\n", error, + libcfs_id2str(peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + break; + } + + return (conn); + } + + if (conn->ksnc_rx_started && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + ksocknal_conn_addref(conn); + CNETERR("Timeout receiving from %s (%u.%u.%u.%u:%d), " + "state %d wanted %d left %d\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port, + conn->ksnc_rx_state, + conn->ksnc_rx_nob_wanted, + conn->ksnc_rx_nob_left); + return (conn); + } + + if ((!list_empty(&conn->ksnc_tx_queue) || + cfs_sock_wmem_queued(conn->ksnc_sock) != 0) && + cfs_time_aftereq(cfs_time_current(), + conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending or + * buffered in the socket's send buffer */ + ksocknal_conn_addref(conn); + CNETERR("Timeout sending data to %s (%u.%u.%u.%u:%d) " + "the network or that node may be down.\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), + conn->ksnc_port); + return (conn); + } + } + + return (NULL); +} + +static inline void +ksocknal_flush_stale_txs(ksock_peer_t *peer) +{ + ksock_tx_t *tx; + LIST_HEAD (stale_txs); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &stale_txs); + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); +} + +int +ksocknal_send_keepalive_locked(ksock_peer_t *peer) +{ + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + + if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */ + return 0; + + if (peer->ksnp_proto != &ksocknal_protocol_v3x) + return 0; + + if (*ksocknal_tunables.ksnd_keepalive <= 0 || + cfs_time_before(cfs_time_current(), + cfs_time_add(peer->ksnp_last_alive, + cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive)))) + return 0; + + if (cfs_time_before(cfs_time_current(), + peer->ksnp_send_keepalive)) + return 0; + + /* retry 10 secs later, so we wouldn't put pressure + * on this peer if we failed to send keepalive this time */ + peer->ksnp_send_keepalive = cfs_time_shift(10); + + conn = ksocknal_find_conn_locked(peer, NULL, 1); + if (conn != NULL) { + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + if (!list_empty(&conn->ksnc_tx_queue)) { + spin_unlock_bh(&sched->kss_lock); + /* there is an queued ACK, don't need keepalive */ + return 0; + } + + spin_unlock_bh(&sched->kss_lock); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* cookie = 1 is reserved for keepalive PING */ + tx = ksocknal_alloc_tx_noop(1, 1); + if (tx == NULL) { + read_lock(&ksocknal_data.ksnd_global_lock); + return -ENOMEM; + } + + if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) { + read_lock(&ksocknal_data.ksnd_global_lock); + return 1; + } + + ksocknal_free_tx(tx); + read_lock(&ksocknal_data.ksnd_global_lock); + + return -EIO; +} + + +void +ksocknal_check_peer_timeouts (int idx) +{ + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; + ksock_peer_t *peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + + again: + /* NB. We expect to have a look at all the peers and not find any + * connections to time out, so we just use a shared lock while we + * take a look... */ + read_lock(&ksocknal_data.ksnd_global_lock); + + list_for_each_entry(peer, peers, ksnp_list) { + cfs_time_t deadline = 0; + int resid = 0; + int n = 0; + + if (ksocknal_send_keepalive_locked(peer) != 0) { + read_unlock(&ksocknal_data.ksnd_global_lock); + goto again; + } + + conn = ksocknal_find_timed_out_conn (peer); + + if (conn != NULL) { + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + + /* NB we won't find this one again, but we can't + * just proceed with the next peer, since we dropped + * ksnd_global_lock and it might be dead already! */ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty (&peer->ksnp_tx_queue)) { + ksock_tx_t *tx = + list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer); + + ksocknal_peer_decref(peer); + goto again; + } + } + + if (list_empty(&peer->ksnp_zc_req_list)) + continue; + + spin_lock(&peer->ksnp_lock); + list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + } + + if (n == 0) { + spin_unlock(&peer->ksnp_lock); + continue; + } + + tx = list_entry(peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + deadline = tx->tx_deadline; + resid = tx->tx_resid; + conn = tx->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer %s detected; the " + "oldest(%p) timed out %ld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer->ksnp_id.nid), tx, + cfs_duration_sec(cfs_time_current() - deadline), + resid, cfs_sock_wmem_queued(conn->ksnc_sock)); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_reaper (void *arg) +{ + wait_queue_t wait; + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + cfs_duration_t timeout; + int i; + int peer_index = 0; + cfs_time_t deadline = cfs_time_current(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry_current (&wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while (!list_empty (&enomem_conns)) { + conn = list_entry (enomem_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... */ + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + waitq_timedwait (&wait, TASK_INTERRUPTIBLE, + timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c new file mode 100644 index 000000000000..3e08fe2d1489 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -0,0 +1,1088 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + + +enum { + SOCKLND_TIMEOUT = 1, + SOCKLND_CREDITS, + SOCKLND_PEER_TXCREDITS, + SOCKLND_PEER_RTRCREDITS, + SOCKLND_PEER_TIMEOUT, + SOCKLND_NCONNDS, + SOCKLND_RECONNECTS_MIN, + SOCKLND_RECONNECTS_MAX, + SOCKLND_EAGER_ACK, + SOCKLND_ZERO_COPY, + SOCKLND_TYPED, + SOCKLND_BULK_MIN, + SOCKLND_RX_BUFFER_SIZE, + SOCKLND_TX_BUFFER_SIZE, + SOCKLND_NAGLE, + SOCKLND_IRQ_AFFINITY, + SOCKLND_ROUND_ROBIN, + SOCKLND_KEEPALIVE, + SOCKLND_KEEPALIVE_IDLE, + SOCKLND_KEEPALIVE_COUNT, + SOCKLND_KEEPALIVE_INTVL, + SOCKLND_BACKOFF_INIT, + SOCKLND_BACKOFF_MAX, + SOCKLND_PROTOCOL, + SOCKLND_ZERO_COPY_RECV, + SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS +}; + +static ctl_table_t ksocknal_ctl_table[] = { + { + .ctl_name = SOCKLND_TIMEOUT, + .procname = "timeout", + .data = &ksocknal_tunables.ksnd_timeout, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_CREDITS, + .procname = "credits", + .data = &ksocknal_tunables.ksnd_credits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_PEER_TXCREDITS, + .procname = "peer_credits", + .data = &ksocknal_tunables.ksnd_peertxcredits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_PEER_RTRCREDITS, + .procname = "peer_buffer_credits", + .data = &ksocknal_tunables.ksnd_peerrtrcredits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_PEER_TIMEOUT, + .procname = "peer_timeout", + .data = &ksocknal_tunables.ksnd_peertimeout, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NCONNDS, + .procname = "nconnds", + .data = &ksocknal_tunables.ksnd_nconnds, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MIN, + .procname = "min_reconnectms", + .data = &ksocknal_tunables.ksnd_min_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MAX, + .procname = "max_reconnectms", + .data = &ksocknal_tunables.ksnd_max_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_EAGER_ACK, + .procname = "eager_ack", + .data = &ksocknal_tunables.ksnd_eager_ack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY, + .procname = "zero_copy", + .data = &ksocknal_tunables.ksnd_zc_min_payload, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY_RECV, + .procname = "zero_copy_recv", + .data = &ksocknal_tunables.ksnd_zc_recv, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + + { + .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS, + .procname = "zero_copy_recv", + .data = &ksocknal_tunables.ksnd_zc_recv_min_nfrags, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TYPED, + .procname = "typed", + .data = &ksocknal_tunables.ksnd_typed_conns, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_BULK_MIN, + .procname = "min_bulk", + .data = &ksocknal_tunables.ksnd_min_bulk, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RX_BUFFER_SIZE, + .procname = "rx_buffer_size", + .data = &ksocknal_tunables.ksnd_rx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TX_BUFFER_SIZE, + .procname = "tx_buffer_size", + .data = &ksocknal_tunables.ksnd_tx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NAGLE, + .procname = "nagle", + .data = &ksocknal_tunables.ksnd_nagle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ROUND_ROBIN, + .procname = "round_robin", + .data = &ksocknal_tunables.ksnd_round_robin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE, + .procname = "keepalive", + .data = &ksocknal_tunables.ksnd_keepalive, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_IDLE, + .procname = "keepalive_idle", + .data = &ksocknal_tunables.ksnd_keepalive_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_COUNT, + .procname = "keepalive_count", + .data = &ksocknal_tunables.ksnd_keepalive_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_INTVL, + .procname = "keepalive_intvl", + .data = &ksocknal_tunables.ksnd_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#if SOCKNAL_VERSION_DEBUG + { + .ctl_name = SOCKLND_PROTOCOL, + .procname = "protocol", + .data = &ksocknal_tunables.ksnd_protocol, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#endif + {0} +}; + + +ctl_table_t ksocknal_top_ctl_table[] = { + { + .ctl_name = CTL_SOCKLND, + .procname = "socknal", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ksocknal_ctl_table + }, + { 0 } +}; + +int +ksocknal_lib_tunables_init () +{ + if (!*ksocknal_tunables.ksnd_typed_conns) { + int rc = -EINVAL; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol < 3) + rc = 0; +#endif + if (rc != 0) { + CERROR("Protocol V3.x MUST have typed connections\n"); + return rc; + } + } + + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; + + ksocknal_tunables.ksnd_sysctl = + cfs_register_sysctl_table(ksocknal_top_ctl_table, 0); + + if (ksocknal_tunables.ksnd_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ + if (ksocknal_tunables.ksnd_sysctl != NULL) + unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl); +} +#else +int +ksocknal_lib_tunables_init () +{ + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ +} +#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */ + +int +ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) +{ + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); + + if (rc != 0) { + CERROR ("Error %d getting sock peer IP\n", rc); + return rc; + } + + rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR ("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +int +ksocknal_lib_zc_capable(ksock_conn_t *conn) +{ + int caps = conn->ksnc_sock->sk->sk_route_caps; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; + + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); +} + +int +ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + int nob; + int rc; + + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* frist sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + + { +#if SOCKNAL_SINGLE_FRAG_TX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + set_fs (KERNEL_DS); + rc = sock_sendmsg(sock, &msg, nob); + set_fs (oldmm); + } + return rc; +} + +int +ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + + /* Not NOOP message */ + LASSERT (tx->tx_lnetmsg != NULL); + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } + } else { +#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." +#endif + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + set_fs (KERNEL_DS); + rc = sock_sendmsg(sock, &msg, nob); + set_fs (oldmm); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack (ksock_conn_t *conn) +{ + int opt = 1; + mm_segment_t oldmm = get_fs(); + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + set_fs(KERNEL_DS); + sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof (opt)); + set_fs(oldmm); +} + +int +ksocknal_lib_recv_iov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct iovec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); + /* NB this is just a boolean..........................^ */ + set_fs (oldmm); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct iovec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct iovec scratch; + struct iovec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." +#endif + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + msg.msg_iovlen = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + msg.msg_iovlen = niov; + } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); + /* NB this is just a boolean.......................^ */ + set_fs (oldmm); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return (rc); +} + +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + mm_segment_t oldmm = get_fs (); + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return (-ESHUTDOWN); + } + + rc = libcfs_sock_getbuf(sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + set_fs(KERNEL_DS); + rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + set_fs(oldmm); + } + + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return (rc); +} + +int +ksocknal_lib_setup_sock (struct socket *sock) +{ + mm_segment_t oldmm = get_fs (); + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct linger linger; + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when we close + * it. */ + + linger.l_onoff = 0; + linger.l_linger = 0; + + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof (linger)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + option = -1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_LINGER2, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_LINGER2: %d\n", rc); + return (rc); + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_NODELAY, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't disable nagle: %d\n", rc); + return (rc); + } + } + + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + + option = (do_keepalive ? 1 : 0); + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } + + if (!do_keepalive) + return (0); + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&keep_idle, sizeof (keep_idle)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + return (rc); + } + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&keep_intvl, sizeof (keep_intvl)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, + (char *)&keep_count, sizeof (keep_count)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPCNT: %d\n", rc); + return (rc); + } + + return (0); +} + +void +ksocknal_lib_push_conn (ksock_conn_t *conn) +{ + struct sock *sk; + struct tcp_sock *tp; + int nonagle; + int val = 1; + int rc; + mm_segment_t oldmm; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + oldmm = get_fs (); + set_fs (KERNEL_DS); + + rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof (val)); + LASSERT (rc == 0); + + set_fs (oldmm); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); + + ksocknal_connsock_decref(conn); +} + +extern void ksocknal_read_callback (ksock_conn_t *conn); +extern void ksocknal_write_callback (ksock_conn_t *conn); +/* + * socket call back in Linux + */ +static void +ksocknal_data_ready (struct sock *sk, int n) +{ + ksock_conn_t *conn; + ENTRY; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_data_ready != &ksocknal_data_ready); + sk->sk_data_ready (sk, n); + } else + ksocknal_read_callback(conn); + + read_unlock(&ksocknal_data.ksnd_global_lock); + + EXIT; +} + +static void +ksocknal_write_space (struct sock *sk) +{ + ksock_conn_t *conn; + int wspace; + int min_wpace; + + /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = sk->sk_user_data; + wspace = SOCKNAL_WSPACE(sk); + min_wpace = SOCKNAL_MIN_WSPACE(sk); + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, wspace, min_wpace, conn, + (conn == NULL) ? "" : (conn->ksnc_tx_ready ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_terminate_conn */ + LASSERT (sk->sk_write_space != &ksocknal_write_space); + sk->sk_write_space (sk); + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; + } + + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); + + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ + + clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +void +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +{ + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; +} + +void +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +{ + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; +} + +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h new file mode 100644 index 000000000000..3c135786dc11 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h @@ -0,0 +1,91 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_PORTAL_ALLOC + +#ifndef __LINUX_SOCKNAL_LIB_H__ +#define __LINUX_SOCKNAL_LIB_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; +#endif +} + +#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk) +#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk) + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#endif diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 000000000000..8a474f64abbe --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +static int sock_timeout = 50; +CFS_MODULE_PARM(sock_timeout, "i", int, 0644, + "dead socket timeout (seconds)"); + +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int peer_buffer_credits = 0; +CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444, + "# per-peer router buffer credits"); + +static int peer_timeout = 180; +CFS_MODULE_PARM(peer_timeout, "i", int, 0444, + "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. */ +static unsigned int nscheds; +CFS_MODULE_PARM(nscheds, "i", int, 0444, + "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +CFS_MODULE_PARM(nconnds, "i", int, 0444, + "# connection daemons while starting"); + +static int nconnds_max = 64; +CFS_MODULE_PARM(nconnds_max, "i", int, 0444, + "max # connection daemons"); + +static int min_reconnectms = 1000; +CFS_MODULE_PARM(min_reconnectms, "i", int, 0644, + "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +CFS_MODULE_PARM(max_reconnectms, "i", int, 0644, + "max connection retry interval (mS)"); + +# define DEFAULT_EAGER_ACK 0 +static int eager_ack = DEFAULT_EAGER_ACK; +CFS_MODULE_PARM(eager_ack, "i", int, 0644, + "send tcp ack packets eagerly"); + +static int typed_conns = 1; +CFS_MODULE_PARM(typed_conns, "i", int, 0444, + "use different sockets for bulk"); + +static int min_bulk = (1<<10); +CFS_MODULE_PARM(min_bulk, "i", int, 0644, + "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644, + "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644, + "socket rx buffer size (0 for system default)"); + +static int nagle = 0; +CFS_MODULE_PARM(nagle, "i", int, 0644, + "enable NAGLE?"); + +static int round_robin = 1; +CFS_MODULE_PARM(round_robin, "i", int, 0644, + "Round robin for multiple interfaces"); + +static int keepalive = 30; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "# seconds before send keepalive"); + +static int keepalive_idle = 30; +CFS_MODULE_PARM(keepalive_idle, "i", int, 0644, + "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +CFS_MODULE_PARM(keepalive_count, "i", int, 0644, + "# missed probes == dead"); + +static int keepalive_intvl = 5; +CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644, + "seconds between probes"); + +static int enable_csum = 0; +CFS_MODULE_PARM(enable_csum, "i", int, 0644, + "enable check sum"); + +static int inject_csum_error = 0; +CFS_MODULE_PARM(inject_csum_error, "i", int, 0644, + "set non-zero to inject a checksum error"); + +static int nonblk_zcack = 1; +CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644, + "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = (16 << 10); +CFS_MODULE_PARM(zc_min_payload, "i", int, 0644, + "minimum payload size to zero copy"); + +static unsigned int zc_recv = 0; +CFS_MODULE_PARM(zc_recv, "i", int, 0644, + "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644, + "minimum # of fragments to enable ZC recv"); + + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +CFS_MODULE_PARM(protocol, "i", int, 0644, + "protocol version"); +#endif + +ksock_tunables_t ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + + + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + ksocknal_tunables.ksnd_sysctl = NULL; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); + + /* initialize platform-sepcific tunables */ + return ksocknal_lib_tunables_init(); +}; + +void ksocknal_tunables_fini(void) +{ + ksocknal_lib_tunables_fini(); +} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 000000000000..ec57179f8d2b --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +/* + * Protocol entries : + * pro_send_hello : send hello message + * pro_recv_hello : receive hello message + * pro_pack : pack message header + * pro_unpack : unpack message header + * pro_queue_tx_zcack() : Called holding BH lock: kss_lock + * return 1 if ACK is piggybacked, otherwise return 0 + * pro_queue_tx_msg() : Called holding BH lock: kss_lock + * return the ACK that piggybacked by my message, or NULL + * pro_handle_zcreq() : handler of incoming ZC-REQ + * pro_handle_zcack() : handler of incoming ZC-ACK + * pro_match_tx() : Called holding glock + */ + +static ksock_tx_t * +ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + /* V1.x, just enqueue it */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; +} + +void +ksocknal_next_tx_carrier(ksock_conn_t *conn) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT (!list_empty(&conn->ksnc_tx_queue)); + LASSERT (tx != NULL); + + /* Next TX that can carry ZC-ACK or LNet message */ + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_carrier = NULL; + } else { + conn->ksnc_tx_carrier = list_entry(tx->tx_list.next, + ksock_tx_t, tx_list); + LASSERT (conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); + } +} + +static int +ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* + * Enqueue or piggyback tx_ack / cookie + * . no tx can piggyback cookie of tx_ack (or cookie), just + * enqueue the tx_ack (if tx_ack != NUL) and return NULL. + * . There is tx can piggyback cookie of tx_ack (or cookie), + * piggyback the cookie and return the tx. + */ + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + if (tx_ack != NULL) + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0); + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_cookies[1] = cookie; + /* move on to the next TX which can carry cookie */ + ksocknal_next_tx_carrier(conn); + + return 1; +} + +static ksock_tx_t * +ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg) +{ + ksock_tx_t *tx = conn->ksnc_tx_carrier; + + /* + * Enqueue tx_msg: + * . If there is no NOOP on the connection, just enqueue + * tx_msg and return NULL + * . If there is NOOP on the connection, piggyback the cookie + * and replace the NOOP tx, and return the NOOP tx. + */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT (tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + if ((tx = conn->ksnc_tx_carrier) == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: "LPU64"\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two seperated cookies: (a+2, a) or (a+1, a) */ + LASSERT (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: "LPU64"\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(ksock_msg_t, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? + sizeof(lnet_hdr_t) : sizeof(ksock_msg_t)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(ksock_msg_t, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +{ + ksock_peer_t *peer = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer, NULL, !!remote); + if (conn != NULL) { + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + if ((rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + LIST_HEAD (zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + socket_t *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + lnet_magicversion_t *hmv; + int rc; + int i; + + CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + hmv = (lnet_magicversion_t *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (lnet_hdr_t) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = libcfs_sock_write(sock, hdr, sizeof(*hdr),lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", + rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + socket_t *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", + rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d)" + " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout) +{ + socket_t *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + rc = libcfs_sock_read(sock, &hdr->src_nid, + sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid), + timeout); + if (rc != 0) { + CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR ("Expecting a HELLO hdr," + " but got type %d from %u.%u.%u.%u\n", + le32_to_cpu (hdr->type), + HIPQUAD(conn->ksnc_ipaddr)); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu (hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu (hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu (hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu (hdr->payload_length) / + sizeof (__u32); + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %u.%u.%u.%u\n", + hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr)); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n", + i, HIPQUAD(conn->ksnc_ipaddr)); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) +{ + socket_t *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = libcfs_sock_read(sock, &hello->kshm_src_nid, + offsetof(ksock_hello_msg_t, kshm_ips) - + offsetof(ksock_hello_msg_t, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %u.%u.%u.%u\n", + hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n", + i, HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t); + + tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t); +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(ksock_msg_t); + tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(ksock_msg_t *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void +ksocknal_unpack_msg_v2(ksock_msg_t *msg) +{ + return; /* Do nothing */ +} + +ksock_proto_t ksocknal_protocol_v1x = +{ + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v2x = +{ + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v3x = +{ + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile new file mode 100644 index 000000000000..1bd9ef774208 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LNET) += lnet.o + +lnet-y := api-errno.o api-ni.o config.o lib-me.o lib-msg.o lib-eq.o \ + lib-md.o lib-ptl.o lib-move.o module.o lo.o router.o \ + router_proc.o acceptor.o peer.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c new file mode 100644 index 000000000000..81ef28bbcba0 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/acceptor.c @@ -0,0 +1,527 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + + +static int accept_port = 988; +static int accept_backlog = 127; +static int accept_timeout = 5; + +struct { + int pta_shutdown; + socket_t *pta_sock; + struct completion pta_signal; +} lnet_acceptor_state; + +int +lnet_acceptor_port(void) +{ + return accept_port; +} + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + + +EXPORT_SYMBOL(lnet_acceptor_port); + +static char *accept = "secure"; + +CFS_MODULE_PARM(accept, "s", charp, 0444, + "Accept connections (secure|all|none)"); +CFS_MODULE_PARM(accept_port, "i", int, 0444, + "Acceptor's port (same on all nodes)"); +CFS_MODULE_PARM(accept_backlog, "i", int, 0444, + "Acceptor's listen backlog"); +CFS_MODULE_PARM(accept_timeout, "i", int, 0644, + "Acceptor's timeout (seconds)"); + +static char *accept_type = NULL; + +int +lnet_acceptor_get_tunables(void) +{ + /* Userland acceptor uses 'accept_type' instead of 'accept', due to + * conflict with 'accept(2)', but kernel acceptor still uses 'accept' + * for compatibility. Hence the trick. */ + accept_type = accept; + return 0; +} + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +void +lnet_connect_console_error (int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CNETERR("Connection to %s at host %u.%u.%u.%u on port %d was " + "refused: check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CNETERR("Connection to %s at host %u.%u.%u.%u " + "was unreachable: the network or that node may " + "be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), HIPQUAD(peer_ip)); + break; + case -ETIMEDOUT: + CNETERR("Connection to %s at host %u.%u.%u.%u on " + "port %d took too long: that node may be hung " + "or experiencing high load.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %u.%u.%u.%u" + " on port %d was reset: " + "is it running a compatible version of " + "Lustre and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port, + libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at " + "host %u.%u.%u.%u on port %d: is it running " + "a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to " + "connect to %s at host %u.%u.%u.%u on port " + "%d\n", libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + default: + LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s" + " at host %u.%u.%u.%u on port %d\n", rc, + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(socket_t **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + lnet_acceptor_connreq_t cr; + socket_t *sock; + int rc; + int port; + int fatal; + + CLASSERT (sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. */ + + rc = libcfs_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1); + + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + lnet_net_lock(LNET_LOCK_EX); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + lnet_net_unlock(LNET_LOCK_EX); + } + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + + failed_sock: + libcfs_sock_release(sock); + failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + + +/* Below is the code common for both kernel and MT user-space */ + +int +lnet_accept(socket_t *sock, __u32 magic) +{ + lnet_acceptor_connreq_t cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + lnet_ni_t *ni; + char *str; + + LASSERT (sizeof(cr) <= 16); /* not too big for the stack */ + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. I send back + * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */ + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to LNET magic from %u.%u.%u.%u: %d\n", + HIPQUAD(peer_ip), rc); + return -EPROTO; + } + + if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) + str = "'old' socknal/tcpnal"; + else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC)) + str = "'old' ranal"; + else + str = "unrecognised"; + + LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %u.%u.%u.%u" + " magic %08x: %s acceptor protocol\n", + HIPQUAD(peer_ip), magic, str); + return -EPROTO; + } + + flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); + + rc = libcfs_sock_read(sock, &cr.acr_version, + sizeof(cr.acr_version), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request version from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + return -EIO; + } + + if (flip) + __swab32s(&cr.acr_version); + + if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { + /* future version compatibility! + * An acceptor-specific protocol rev will first send a version + * query. I send back my current version to tell her I'm + * "old". */ + int peer_version = cr.acr_version; + + memset (&cr, 0, sizeof(cr)); + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + + if (rc != 0) + CERROR("Error sending magic+version in response" + "to version %d from %u.%u.%u.%u: %d\n", + peer_version, HIPQUAD(peer_ip), rc); + return -EPROTO; + } + + rc = libcfs_sock_read(sock, &cr.acr_nid, + sizeof(cr) - + offsetof(lnet_acceptor_connreq_t, acr_nid), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + return -EIO; + } + + if (flip) + __swab64s(&cr.acr_nid); + + ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); + if (ni == NULL || /* no matching net */ + ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ + if (ni != NULL) + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %u.%u.%u.%u" + " for %s: No matching NI\n", + HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + if (ni->ni_lnd->lnd_accept == NULL) { + /* This catches a request for the loopback LND */ + lnet_ni_decref(ni); + LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %u.%u.%u.%u" + " for %s: NI doesn not accept IP connections\n", + HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid)); + return -EPERM; + } + + CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u\n", + libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip)); + + rc = ni->ni_lnd->lnd_accept(ni, sock); + + lnet_ni_decref(ni); + return rc; +} + +int +lnet_acceptor(void *arg) +{ + socket_t *newsock; + int rc; + __u32 magic; + __u32 peer_ip; + int peer_port; + int secure = (int)((long_ptr_t)arg); + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + + cfs_block_allsigs(); + + rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port" + " %d: port already in use\n", + accept_port); + else + LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port " + "%d: unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + complete(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (!lnet_acceptor_state.pta_shutdown) { + + rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + cfs_pause(cfs_time_seconds(1)); + } + continue; + } + + /* maybe we're waken up with libcfs_sock_abort_accept() */ + if (lnet_acceptor_state.pta_shutdown) { + libcfs_sock_release(newsock); + break; + } + + rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %u.%u.%u.%u: " + "insecure port %d\n", + HIPQUAD(peer_ip), peer_port); + goto failed; + } + + rc = libcfs_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + goto failed; + } + + rc = lnet_accept(newsock, magic); + if (rc != 0) + goto failed; + + continue; + + failed: + libcfs_sock_release(newsock); + } + + libcfs_sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + CDEBUG(D_NET, "Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + complete(&lnet_acceptor_state.pta_signal); + return 0; +} + +static inline int +accept2secure(const char *acc, long *sec) +{ + if (!strcmp(acc, "secure")) { + *sec = 1; + return 1; + } else if (!strcmp(acc, "all")) { + *sec = 0; + return 1; + } else if (!strcmp(acc, "none")) { + return 0; + } else { + LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", + acc); + return -EINVAL; + } +} + +int +lnet_acceptor_start(void) +{ + int rc; + long rc2; + long secure; + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + + rc = lnet_acceptor_get_tunables(); + if (rc != 0) + return rc; + + + init_completion(&lnet_acceptor_state.pta_signal); + rc = accept2secure(accept_type, &secure); + if (rc <= 0) { + fini_completion(&lnet_acceptor_state.pta_signal); + return rc; + } + + if (lnet_count_acceptor_nis() == 0) /* not required */ + return 0; + + rc2 = PTR_ERR(kthread_run(lnet_acceptor, + (void *)(ulong_ptr_t)secure, + "acceptor_%03ld", secure)); + if (IS_ERR_VALUE(rc2)) { + CERROR("Can't start acceptor thread: %ld\n", rc2); + fini_completion(&lnet_acceptor_state.pta_signal); + + return -ESRCH; + } + + /* wait for acceptor to startup */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + if (!lnet_acceptor_state.pta_shutdown) { + /* started OK */ + LASSERT(lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT(lnet_acceptor_state.pta_sock == NULL); + fini_completion(&lnet_acceptor_state.pta_signal); + + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_sock == NULL) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock); + + /* block until acceptor signals exit */ + wait_for_completion(&lnet_acceptor_state.pta_signal); + + fini_completion(&lnet_acceptor_state.pta_signal); +} diff --git a/drivers/staging/lustre/lnet/lnet/api-errno.c b/drivers/staging/lustre/lnet/lnet/api-errno.c new file mode 100644 index 000000000000..695b27265e23 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/api-errno.c @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/api-errno.c + * + * Instantiate the string table of errors + */ + +/* If you change these, you must update the number table in portals/errno.h */ diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c new file mode 100644 index 000000000000..e88bee362497 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/api-ni.c @@ -0,0 +1,1941 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +#define D_LNI D_CONSOLE + +lnet_t the_lnet; /* THE state of the network */ +EXPORT_SYMBOL(the_lnet); + + +static char *ip2nets = ""; +CFS_MODULE_PARM(ip2nets, "s", charp, 0444, + "LNET network <- IP table"); + +static char *networks = ""; +CFS_MODULE_PARM(networks, "s", charp, 0444, + "local networks"); + +static char *routes = ""; +CFS_MODULE_PARM(routes, "s", charp, 0444, + "routes to non-local networks"); + +static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; +CFS_MODULE_PARM(rnet_htable_size, "i", int, 0444, + "size of remote network hash table"); + +char * +lnet_get_routes(void) +{ + return routes; +} + +char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or " + "'ip2nets' but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +void +lnet_init_locks(void) +{ + spin_lock_init(&the_lnet.ln_eq_wait_lock); + init_waitqueue_head(&the_lnet.ln_eq_waitq); + mutex_init(&the_lnet.ln_lnd_mutex); + mutex_init(&the_lnet.ln_api_mutex); +} + +void +lnet_fini_locks(void) +{ +} + + +static int +lnet_create_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + LASSERT(the_lnet.ln_remote_nets_hash == NULL); + LASSERT(the_lnet.ln_remote_nets_hbits > 0); + LIBCFS_ALLOC(hash, LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create remote nets hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash[i]); + the_lnet.ln_remote_nets_hash = hash; + return 0; +} + +static void +lnet_destroy_remote_nets_table(void) +{ + int i; + struct list_head *hash; + + if (the_lnet.ln_remote_nets_hash == NULL) + return; + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) + LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); + + LIBCFS_FREE(the_lnet.ln_remote_nets_hash, + LNET_REMOTE_NETS_HASH_SIZE * sizeof(*hash)); + the_lnet.ln_remote_nets_hash = NULL; +} + +static void +lnet_destroy_locks(void) +{ + if (the_lnet.ln_res_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_res_lock); + the_lnet.ln_res_lock = NULL; + } + + if (the_lnet.ln_net_lock != NULL) { + cfs_percpt_lock_free(the_lnet.ln_net_lock); + the_lnet.ln_net_lock = NULL; + } + + lnet_fini_locks(); +} + +static int +lnet_create_locks(void) +{ + lnet_init_locks(); + + the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_res_lock == NULL) + goto failed; + + the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); + if (the_lnet.ln_net_lock == NULL) + goto failed; + + return 0; + + failed: + lnet_destroy_locks(); + return -ENOMEM; +} + +void lnet_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ + + /* Constants... */ + CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT (LNET_MSG_ACK == 0); + CLASSERT (LNET_MSG_PUT == 1); + CLASSERT (LNET_MSG_GET == 2); + CLASSERT (LNET_MSG_REPLY == 3); + CLASSERT (LNET_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct lnet_magicversion_t */ + CLASSERT ((int)sizeof(lnet_magicversion_t) == 8); + CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct lnet_hdr_t */ + CLASSERT ((int)sizeof(lnet_hdr_t) == 72); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40); + + /* Ack */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16); + + /* Hello */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4); +} + +lnd_t * +lnet_find_lnd_by_type (int type) +{ + lnd_t *lnd; + struct list_head *tmp; + + /* holding lnd mutex */ + list_for_each (tmp, &the_lnet.ln_lnds) { + lnd = list_entry(tmp, lnd_t, lnd_list); + + if ((int)lnd->lnd_type == type) + return lnd; + } + + return NULL; +} + +void +lnet_register_lnd (lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (libcfs_isknown_lnd(lnd->lnd_type)); + LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL); + + list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds); + lnd->lnd_refcount = 0; + + CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_register_lnd); + +void +lnet_unregister_lnd (lnd_t *lnd) +{ + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd); + LASSERT (lnd->lnd_refcount == 0); + + list_del (&lnd->lnd_list); + CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); +} +EXPORT_SYMBOL(lnet_unregister_lnd); + +void +lnet_counters_get(lnet_counters_t *counters) +{ + lnet_counters_t *ctr; + int i; + + memset(counters, 0, sizeof(*counters)); + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { + counters->msgs_max += ctr->msgs_max; + counters->msgs_alloc += ctr->msgs_alloc; + counters->errors += ctr->errors; + counters->send_count += ctr->send_count; + counters->recv_count += ctr->recv_count; + counters->route_count += ctr->route_count; + counters->drop_length += ctr->drop_length; + counters->send_length += ctr->send_length; + counters->recv_length += ctr->recv_length; + counters->route_length += ctr->route_length; + counters->drop_length += ctr->drop_length; + + } + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_get); + +void +lnet_counters_reset(void) +{ + lnet_counters_t *counters; + int i; + + lnet_net_lock(LNET_LOCK_EX); + + cfs_percpt_for_each(counters, i, the_lnet.ln_counters) + memset(counters, 0, sizeof(lnet_counters_t)); + + lnet_net_unlock(LNET_LOCK_EX); +} +EXPORT_SYMBOL(lnet_counters_reset); + +#ifdef LNET_USE_LIB_FREELIST + +int +lnet_freelist_init (lnet_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lnet_freeobj_t, fo_contents); + + LIBCFS_ALLOC(space, n * size); + if (space == NULL) + return (-ENOMEM); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (0); +} + +void +lnet_freelist_fini (lnet_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); + + LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset (fl, 0, sizeof (*fl)); +} + +#endif /* LNET_USE_LIB_FREELIST */ + +__u64 +lnet_create_interface_cookie (void) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid after reboot. Initialisation time, + * even if it's only implemented to millisecond resolution is probably + * easily good enough. */ + struct timeval tv; + __u64 cookie; + do_gettimeofday(&tv); + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return cookie; +} + +static char * +lnet_res_type2str(int type) +{ + switch (type) { + default: + LBUG(); + case LNET_COOKIE_TYPE_MD: + return "MD"; + case LNET_COOKIE_TYPE_ME: + return "ME"; + case LNET_COOKIE_TYPE_EQ: + return "EQ"; + } +} + +void +lnet_res_container_cleanup(struct lnet_res_container *rec) +{ + int count = 0; + + if (rec->rec_type == 0) /* not set yet, it's uninitialized */ + return; + + while (!list_empty(&rec->rec_active)) { + struct list_head *e = rec->rec_active.next; + + list_del_init(e); + if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { + lnet_eq_free(list_entry(e, lnet_eq_t, eq_list)); + + } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { + lnet_md_free(list_entry(e, lnet_libmd_t, md_list)); + + } else { /* NB: Active MEs should be attached on portals */ + LBUG(); + } + count++; + } + + if (count > 0) { + /* Found alive MD/ME/EQ, user really should unlink/free + * all of them before finalize LNet, but if someone didn't, + * we have to recycle garbage for him */ + CERROR("%d active elements on exit of %s container\n", + count, lnet_res_type2str(rec->rec_type)); + } + +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&rec->rec_freelist); +#endif + if (rec->rec_lh_hash != NULL) { + LIBCFS_FREE(rec->rec_lh_hash, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + rec->rec_lh_hash = NULL; + } + + rec->rec_type = 0; /* mark it as finalized */ +} + +int +lnet_res_container_setup(struct lnet_res_container *rec, + int cpt, int type, int objnum, int objsz) +{ + int rc = 0; + int i; + + LASSERT(rec->rec_type == 0); + + rec->rec_type = type; + INIT_LIST_HEAD(&rec->rec_active); + +#ifdef LNET_USE_LIB_FREELIST + memset(&rec->rec_freelist, 0, sizeof(rec->rec_freelist)); + rc = lnet_freelist_init(&rec->rec_freelist, objnum, objsz); + if (rc != 0) + goto out; +#endif + rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; + + /* Arbitrary choice of hash table size */ + LIBCFS_CPT_ALLOC(rec->rec_lh_hash, lnet_cpt_table(), cpt, + LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0])); + if (rec->rec_lh_hash == NULL) { + rc = -ENOMEM; + goto out; + } + + for (i = 0; i < LNET_LH_HASH_SIZE; i++) + INIT_LIST_HEAD(&rec->rec_lh_hash[i]); + + return 0; + +out: + CERROR("Failed to setup %s resource container\n", + lnet_res_type2str(type)); + lnet_res_container_cleanup(rec); + return rc; +} + +static void +lnet_res_containers_destroy(struct lnet_res_container **recs) +{ + struct lnet_res_container *rec; + int i; + + cfs_percpt_for_each(rec, i, recs) + lnet_res_container_cleanup(rec); + + cfs_percpt_free(recs); +} + +static struct lnet_res_container ** +lnet_res_containers_create(int type, int objnum, int objsz) +{ + struct lnet_res_container **recs; + struct lnet_res_container *rec; + int rc; + int i; + + recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); + if (recs == NULL) { + CERROR("Failed to allocate %s resource containers\n", + lnet_res_type2str(type)); + return NULL; + } + + cfs_percpt_for_each(rec, i, recs) { + rc = lnet_res_container_setup(rec, i, type, objnum, objsz); + if (rc != 0) { + lnet_res_containers_destroy(recs); + return NULL; + } + } + + return recs; +} + +lnet_libhandle_t * +lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) +{ + /* ALWAYS called with lnet_res_lock held */ + struct list_head *head; + lnet_libhandle_t *lh; + unsigned int hash; + + if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) + return NULL; + + hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); + head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; + + list_for_each_entry(lh, head, lh_hash_chain) { + if (lh->lh_cookie == cookie) + return lh; + } + + return NULL; +} + +void +lnet_res_lh_initialize(struct lnet_res_container *rec, lnet_libhandle_t *lh) +{ + /* ALWAYS called with lnet_res_lock held */ + unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; + unsigned int hash; + + lh->lh_cookie = rec->rec_lh_cookie; + rec->rec_lh_cookie += 1 << ibits; + + hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; + + list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); +} + + +int lnet_unprepare(void); + +int +lnet_prepare(lnet_pid_t requested_pid) +{ + /* Prepare to bring up the network */ + struct lnet_res_container **recs; + int rc = 0; + + LASSERT (the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + + LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; + + INIT_LIST_HEAD(&the_lnet.ln_test_peers); + INIT_LIST_HEAD(&the_lnet.ln_nis); + INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); + INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); + INIT_LIST_HEAD(&the_lnet.ln_routers); + + rc = lnet_create_remote_nets_table(); + if (rc != 0) + goto failed; + + the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); + + the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(lnet_counters_t)); + if (the_lnet.ln_counters == NULL) { + CERROR("Failed to allocate counters for LNet\n"); + rc = -ENOMEM; + goto failed; + } + + rc = lnet_peer_tables_create(); + if (rc != 0) + goto failed; + + rc = lnet_msg_containers_create(); + if (rc != 0) + goto failed; + + rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, + LNET_COOKIE_TYPE_EQ, LNET_FL_MAX_EQS, + sizeof(lnet_eq_t)); + if (rc != 0) + goto failed; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME, LNET_FL_MAX_MES, + sizeof(lnet_me_t)); + if (recs == NULL) + goto failed; + + the_lnet.ln_me_containers = recs; + + recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD, LNET_FL_MAX_MDS, + sizeof(lnet_libmd_t)); + if (recs == NULL) + goto failed; + + the_lnet.ln_md_containers = recs; + + rc = lnet_portals_create(); + if (rc != 0) { + CERROR("Failed to create portals for LNet: %d\n", rc); + goto failed; + } + + return 0; + + failed: + lnet_unprepare(); + return rc; +} + +int +lnet_unprepare (void) +{ + /* NB no LNET_LOCK since this is the last reference. All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ + + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_test_peers)); + LASSERT(list_empty(&the_lnet.ln_nis)); + LASSERT(list_empty(&the_lnet.ln_nis_cpt)); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_portals_destroy(); + + if (the_lnet.ln_md_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_md_containers); + the_lnet.ln_md_containers = NULL; + } + + if (the_lnet.ln_me_containers != NULL) { + lnet_res_containers_destroy(the_lnet.ln_me_containers); + the_lnet.ln_me_containers = NULL; + } + + lnet_res_container_cleanup(&the_lnet.ln_eq_container); + + lnet_msg_containers_destroy(); + lnet_peer_tables_destroy(); + lnet_rtrpools_free(); + + if (the_lnet.ln_counters != NULL) { + cfs_percpt_free(the_lnet.ln_counters); + the_lnet.ln_counters = NULL; + } + lnet_destroy_remote_nets_table(); + + return 0; +} + +lnet_ni_t * +lnet_net2ni_locked(__u32 net, int cpt) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +lnet_ni_t * +lnet_net2ni(__u32 net) +{ + lnet_ni_t *ni; + + lnet_net_lock(0); + ni = lnet_net2ni_locked(net, 0); + lnet_net_unlock(0); + + return ni; +} +EXPORT_SYMBOL(lnet_net2ni); + +static unsigned int +lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) +{ + __u64 key = nid; + unsigned int val; + + LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); + + if (number == 1) + return 0; + + val = cfs_hash_long(key, LNET_CPT_BITS); + /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ + if (val < number) + return val; + + return (unsigned int)(key + val + (val >> 1)) % number; +} + +int +lnet_cpt_of_nid_locked(lnet_nid_t nid) +{ + struct lnet_ni *ni; + + /* must called with hold of lnet_net_lock */ + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + /* take lnet_net_lock(any) would be OK */ + if (!list_empty(&the_lnet.ln_nis_cpt)) { + list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) { + if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) + continue; + + LASSERT(ni->ni_cpts != NULL); + return ni->ni_cpts[lnet_nid_cpt_hash + (nid, ni->ni_ncpts)]; + } + } + + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); +} + +int +lnet_cpt_of_nid(lnet_nid_t nid) +{ + int cpt; + int cpt2; + + if (LNET_CPT_NUMBER == 1) + return 0; /* the only one */ + + if (list_empty(&the_lnet.ln_nis_cpt)) + return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); + + cpt = lnet_net_lock_current(); + cpt2 = lnet_cpt_of_nid_locked(nid); + lnet_net_unlock(cpt); + + return cpt2; +} +EXPORT_SYMBOL(lnet_cpt_of_nid); + +int +lnet_islocalnet(__u32 net) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + + ni = lnet_net2ni_locked(net, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + + lnet_net_unlock(cpt); + + return ni != NULL; +} + +lnet_ni_t * +lnet_nid2ni_locked(lnet_nid_t nid, int cpt) +{ + struct lnet_ni *ni; + struct list_head *tmp; + + LASSERT(cpt != LNET_LOCK_EX); + + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_nid == nid) { + lnet_ni_addref_locked(ni, cpt); + return ni; + } + } + + return NULL; +} + +int +lnet_islocalnid(lnet_nid_t nid) +{ + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + ni = lnet_nid2ni_locked(nid, cpt); + if (ni != NULL) + lnet_ni_decref_locked(ni, cpt); + lnet_net_unlock(cpt); + + return ni != NULL; +} + +int +lnet_count_acceptor_nis (void) +{ + /* Return the # of NIs that need the acceptor. */ + int count = 0; + struct list_head *tmp; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (ni->ni_lnd->lnd_accept != NULL) + count++; + } + + lnet_net_unlock(cpt); + + return count; +} + +static int +lnet_ni_tq_credits(lnet_ni_t *ni) +{ + int credits; + + LASSERT(ni->ni_ncpts >= 1); + + if (ni->ni_ncpts == 1) + return ni->ni_maxtxcredits; + + credits = ni->ni_maxtxcredits / ni->ni_ncpts; + credits = max(credits, 8 * ni->ni_peertxcredits); + credits = min(credits, ni->ni_maxtxcredits); + + return credits; +} + +void +lnet_shutdown_lndnis (void) +{ + int i; + int islo; + lnet_ni_t *ni; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_refcount == 0); + LASSERT(list_empty(&the_lnet.ln_nis_zombie)); + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_shutdown = 1; /* flag shutdown */ + + /* Unlink NIs from the global table */ + while (!list_empty(&the_lnet.ln_nis)) { + ni = list_entry(the_lnet.ln_nis.next, + lnet_ni_t, ni_list); + /* move it to zombie list and nobody can find it anymore */ + list_move(&ni->ni_list, &the_lnet.ln_nis_zombie); + lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */ + + if (!list_empty(&ni->ni_cptlist)) { + list_del_init(&ni->ni_cptlist); + lnet_ni_decref_locked(ni, 0); + } + } + + /* Drop the cached eqwait NI. */ + if (the_lnet.ln_eq_waitni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_eq_waitni, 0); + the_lnet.ln_eq_waitni = NULL; + } + + /* Drop the cached loopback NI. */ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni, 0); + the_lnet.ln_loni = NULL; + } + + lnet_net_unlock(LNET_LOCK_EX); + + /* Clear lazy portals and drop delayed messages which hold refs + * on their lnet_msg_t::msg_rxpeer */ + for (i = 0; i < the_lnet.ln_nportals; i++) + LNetClearLazyPortal(i); + + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + lnet_peer_tables_cleanup(); + + lnet_net_lock(LNET_LOCK_EX); + /* Now wait for the NI's I just nuked to show up on ln_zombie_nis + * and shut them down in guaranteed thread context */ + i = 2; + while (!list_empty(&the_lnet.ln_nis_zombie)) { + int *ref; + int j; + + ni = list_entry(the_lnet.ln_nis_zombie.next, + lnet_ni_t, ni_list); + list_del_init(&ni->ni_list); + cfs_percpt_for_each(ref, j, ni->ni_refs) { + if (*ref == 0) + continue; + /* still busy, add it back to zombie list */ + list_add(&ni->ni_list, &the_lnet.ln_nis_zombie); + break; + } + + while (!list_empty(&ni->ni_list)) { + lnet_net_unlock(LNET_LOCK_EX); + ++i; + if ((i & (-i)) == i) { + CDEBUG(D_WARNING, + "Waiting for zombie LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + } + cfs_pause(cfs_time_seconds(1)); + lnet_net_lock(LNET_LOCK_EX); + continue; + } + + ni->ni_lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + + islo = ni->ni_lnd->lnd_type == LOLND; + + LASSERT (!in_interrupt ()); + (ni->ni_lnd->lnd_shutdown)(ni); + + /* can't deref lnd anymore now; it might have unregistered + * itself... */ + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + lnet_ni_free(ni); + lnet_net_lock(LNET_LOCK_EX); + } + + the_lnet.ln_shutdown = 0; + lnet_net_unlock(LNET_LOCK_EX); + + if (the_lnet.ln_network_tokens != NULL) { + LIBCFS_FREE(the_lnet.ln_network_tokens, + the_lnet.ln_network_tokens_nob); + the_lnet.ln_network_tokens = NULL; + } +} + +int +lnet_startup_lndnis (void) +{ + lnd_t *lnd; + struct lnet_ni *ni; + struct lnet_tx_queue *tq; + struct list_head nilist; + int i; + int rc = 0; + int lnd_type; + int nicount = 0; + char *nets = lnet_get_networks(); + + INIT_LIST_HEAD(&nilist); + + if (nets == NULL) + goto failed; + + rc = lnet_parse_networks(&nilist, nets); + if (rc != 0) + goto failed; + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + LASSERT (libcfs_isknown_lnd(lnd_type)); + + if (lnd_type == CIBLND || + lnd_type == OPENIBLND || + lnd_type == IIBLND || + lnd_type == VIBLND) { + CERROR("LND %s obsoleted\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + rc = request_module("%s", + libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_LOCK(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); + goto failed; + } + } + + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount++; + lnet_net_unlock(LNET_LOCK_EX); + + ni->ni_lnd = lnd; + + rc = (lnd->lnd_startup)(ni); + + LNET_MUTEX_UNLOCK(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s" + "\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + lnet_net_lock(LNET_LOCK_EX); + lnd->lnd_refcount--; + lnet_net_unlock(LNET_LOCK_EX); + goto failed; + } + + LASSERT (ni->ni_peertimeout <= 0 || lnd->lnd_query != NULL); + + list_del(&ni->ni_list); + + lnet_net_lock(LNET_LOCK_EX); + /* refcount for ln_nis */ + lnet_ni_addref_locked(ni, 0); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + if (ni->ni_cpts != NULL) { + list_add_tail(&ni->ni_cptlist, + &the_lnet.ln_nis_cpt); + lnet_ni_addref_locked(ni, 0); + } + + lnet_net_unlock(LNET_LOCK_EX); + + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT (the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; + } + + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? + "" : "per-peer "); + goto failed; + } + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + tq->tq_credits_min = + tq->tq_credits_max = + tq->tq_credits = lnet_ni_tq_credits(ni); + } + + CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", + libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, + lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, + ni->ni_peerrtrcredits, ni->ni_peertimeout); + + nicount++; + } + + if (the_lnet.ln_eq_waitni != NULL && nicount > 1) { + lnd_type = the_lnet.ln_eq_waitni->ni_lnd->lnd_type; + LCONSOLE_ERROR_MSG(0x109, "LND %s can only run single-network" + "\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + return 0; + + failed: + lnet_shutdown_lndnis(); + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + return -ENETDOWN; +} + +/** + * Initialize LNet library. + * + * Only userspace program needs to call this function - it's automatically + * called in the kernel at module loading time. Caller has to call LNetFini() + * after a call to LNetInit(), if and only if the latter returned 0. It must + * be called exactly once. + * + * \return 0 on success, and -ve on failures. + */ +int +LNetInit(void) +{ + int rc; + + lnet_assert_wire_constants(); + LASSERT(!the_lnet.ln_init); + + memset(&the_lnet, 0, sizeof(the_lnet)); + + /* refer to global cfs_cpt_table for now */ + the_lnet.ln_cpt_table = cfs_cpt_table; + the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_table); + + LASSERT(the_lnet.ln_cpt_number > 0); + if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { + /* we are under risk of consuming all lh_cookie */ + CERROR("Can't have %d CPTs for LNet (max allowed is %d), " + "please change setting of CPT-table and retry\n", + the_lnet.ln_cpt_number, LNET_CPT_MAX); + return -1; + } + + while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) + the_lnet.ln_cpt_bits++; + + rc = lnet_create_locks(); + if (rc != 0) { + CERROR("Can't create LNet global locks: %d\n", rc); + return -1; + } + + the_lnet.ln_refcount = 0; + the_lnet.ln_init = 1; + LNetInvalidateHandle(&the_lnet.ln_rc_eqh); + INIT_LIST_HEAD(&the_lnet.ln_lnds); + INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); + INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); + + /* The hash table size is the number of bits it takes to express the set + * ln_num_routes, minus 1 (better to under estimate than over so we + * don't waste memory). */ + if (rnet_htable_size <= 0) + rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; + else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) + rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; + the_lnet.ln_remote_nets_hbits = max_t(int, 1, + order_base_2(rnet_htable_size) - 1); + + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ + lnet_register_lnd(&the_lolnd); + return 0; +} +EXPORT_SYMBOL(LNetInit); + +/** + * Finalize LNet library. + * + * Only userspace program needs to call this function. It can be called + * at most once. + * + * \pre LNetInit() called with success. + * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. + */ +void +LNetFini(void) +{ + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + lnd_t, lnd_list)); + lnet_destroy_locks(); + + the_lnet.ln_init = 0; +} +EXPORT_SYMBOL(LNetFini); + +/** + * Set LNet PID and start LNet interfaces, routing, and forwarding. + * + * Userspace program should call this after a successful call to LNetInit(). + * Users must call this function at least once before any other functions. + * For each successful call there must be a corresponding call to + * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is + * ignored. + * + * The PID used by LNet may be different from the one requested. + * See LNetGetId(). + * + * \param requested_pid PID requested by the caller. + * + * \return >= 0 on success, and < 0 error code on failures. + */ +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + goto out; + } + + lnet_get_tunables(); + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + rc = -ENETDOWN; + goto failed0; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) + goto failed0; + + rc = lnet_startup_lndnis(); + if (rc != 0) + goto failed1; + + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_check_routes(); + if (rc != 0) + goto failed2; + + rc = lnet_rtrpools_alloc(im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_acceptor_start(); + if (rc != 0) + goto failed2; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... */ + + /* NB router checker needs the_lnet.ln_ping_info in + * lnet_router_checker -> lnet_update_ni_status_locked */ + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed3; + + rc = lnet_router_checker_start(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); + goto out; + + failed4: + lnet_ping_target_fini(); + failed3: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); + failed2: + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + failed1: + lnet_unprepare(); + failed0: + LASSERT (rc < 0); + out: + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return rc; +} +EXPORT_SYMBOL(LNetNIInit); + +/** + * Stop LNet interfaces, routing, and forwarding. + * + * Users must call this function once for each successful call to LNetNIInit(). + * Once the LNetNIFini() operation has been started, the results of pending + * API operations are undefined. + * + * \return always 0 for current implementation. + */ +int +LNetNIFini() +{ + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT (!the_lnet.ln_niinit_self); + + lnet_proc_fini(); + lnet_router_checker_stop(); + lnet_ping_target_fini(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + lnet_unprepare(); + } + + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + return 0; +} +EXPORT_SYMBOL(LNetNIFini); + +/** + * This is an ugly hack to export IOC_LIBCFS_DEBUG_PEER and + * IOC_LIBCFS_PORTALS_COMPATIBILITY commands to users, by tweaking the LNet + * internal ioctl handler. + * + * IOC_LIBCFS_PORTALS_COMPATIBILITY is now deprecated, don't use it. + * + * \param cmd IOC_LIBCFS_DEBUG_PEER to print debugging data about a peer. + * The data will be printed to system console. Don't use it excessively. + * \param arg A pointer to lnet_process_id_t, process ID of the peer. + * + * \return Always return 0 when called by users directly (i.e., not via ioctl). + */ +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + lnet_process_id_t id = {0}; + lnet_ni_t *ni; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + rc = lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid); + return (rc != 0) ? rc : lnet_check_routes(); + + case IOC_LIBCFS_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + + case IOC_LIBCFS_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags); + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + cfs_time_current() - + cfs_time_seconds(cfs_time_current_sec() - + (time_t)data->ioc_u64[0])); + + case IOC_LIBCFS_PORTALS_COMPATIBILITY: + /* This can be removed once lustre stops calling it */ + return 0; + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_testprotocompat = data->ioc_flags; + lnet_net_unlock(LNET_LOCK_EX); + return 0; + + case IOC_LIBCFS_PING: + id.nid = data->ioc_nid; + id.pid = data->ioc_u32[0]; + rc = lnet_ping(id, data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + + case IOC_LIBCFS_DEBUG_PEER: { + /* CAVEAT EMPTOR: this one designed for calling directly; not + * via an ioctl */ + id = *((lnet_process_id_t *) arg); + + lnet_debug_peer(id.nid); + + ni = lnet_net2ni(LNET_NIDNET(id.nid)); + if (ni == NULL) { + CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(id)); + } else { + if (ni->ni_lnd->lnd_ctl == NULL) { + CDEBUG(D_WARNING, "No ctl for %s\n", + libcfs_id2str(id)); + } else { + (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg); + } + + lnet_ni_decref(ni); + } + return 0; + } + + default: + ni = lnet_net2ni(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; + } + /* not reached */ +} +EXPORT_SYMBOL(LNetCtl); + +/** + * Retrieve the lnet_process_id_t ID of LNet interface at \a index. Note that + * all interfaces share a same PID, as requested by LNetNIInit(). + * + * \param index Index of the interface to look up. + * \param id On successful return, this location will hold the + * lnet_process_id_t ID of the interface. + * + * \retval 0 If an interface exists at \a index. + * \retval -ENOENT If no interface has been found. + */ +int +LNetGetId(unsigned int index, lnet_process_id_t *id) +{ + struct lnet_ni *ni; + struct list_head *tmp; + int cpt; + int rc = -ENOENT; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each(tmp, &the_lnet.ln_nis) { + if (index-- != 0) + continue; + + ni = list_entry(tmp, lnet_ni_t, ni_list); + + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; + } + + lnet_net_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetGetId); + +/** + * Print a string representation of handle \a h into buffer \a str of + * \a len bytes. + */ +void +LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) +{ + snprintf(str, len, LPX64, h.cookie); +} +EXPORT_SYMBOL(LNetSnprintHandle); + +static int +lnet_create_ping_info(void) +{ + int i; + int n; + int rc; + unsigned int infosz; + lnet_ni_t *ni; + lnet_process_id_t id; + lnet_ping_info_t *pinfo; + + for (n = 0; ; n++) { + rc = LNetGetId(n, &id); + if (rc == -ENOENT) + break; + + LASSERT (rc == 0); + } + + infosz = offsetof(lnet_ping_info_t, pi_ni[n]); + LIBCFS_ALLOC(pinfo, infosz); + if (pinfo == NULL) { + CERROR("Can't allocate ping info[%d]\n", n); + return -ENOMEM; + } + + pinfo->pi_nnis = n; + pinfo->pi_pid = the_lnet.ln_pid; + pinfo->pi_magic = LNET_PROTO_PING_MAGIC; + pinfo->pi_features = LNET_PING_FEAT_NI_STATUS; + + for (i = 0; i < n; i++) { + lnet_ni_status_t *ns = &pinfo->pi_ni[i]; + + rc = LNetGetId(i, &id); + LASSERT (rc == 0); + + ns->ns_nid = id.nid; + ns->ns_status = LNET_NI_STATUS_UP; + + lnet_net_lock(0); + + ni = lnet_nid2ni_locked(id.nid, 0); + LASSERT(ni != NULL); + + lnet_ni_lock(ni); + LASSERT(ni->ni_status == NULL); + ni->ni_status = ns; + lnet_ni_unlock(ni); + + lnet_ni_decref_locked(ni, 0); + lnet_net_unlock(0); + } + + the_lnet.ln_ping_info = pinfo; + return 0; +} + +static void +lnet_destroy_ping_info(void) +{ + struct lnet_ni *ni; + + lnet_net_lock(0); + + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + lnet_ni_lock(ni); + ni->ni_status = NULL; + lnet_ni_unlock(ni); + } + + lnet_net_unlock(0); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis])); + the_lnet.ln_ping_info = NULL; + return; +} + +int +lnet_ping_target_init(void) +{ + lnet_md_t md = {0}; + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int infosz; + + rc = lnet_create_ping_info(); + if (rc != 0) + return rc; + + /* We can have a tiny EQ since we only need to see the unlink event on + * teardown, which by definition is the last one! */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + goto failed_0; + } + + memset(&id, 0, sizeof(lnet_process_id_t)); + id.nid = LNET_NID_ANY; + id.pid = LNET_PID_ANY; + + rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, + LNET_PROTO_PING_MATCHBITS, 0, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + /* initialize md content */ + infosz = offsetof(lnet_ping_info_t, + pi_ni[the_lnet.ln_ping_info->pi_nnis]); + md.start = the_lnet.ln_ping_info; + md.length = infosz; + md.threshold = LNET_MD_THRESH_INF; + md.max_size = 0; + md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE; + md.user_ptr = NULL; + md.eq_handle = the_lnet.ln_ping_target_eq; + + rc = LNetMDAttach(meh, md, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + + failed_2: + rc2 = LNetMEUnlink(meh); + LASSERT (rc2 == 0); + failed_1: + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc2 == 0); + failed_0: + lnet_destroy_ping_info(); + return rc; +} + +void +lnet_ping_target_fini(void) +{ + lnet_event_t event; + int rc; + int which; + int timeout_ms = 1000; + sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(the_lnet.ln_ping_target_md); + /* NB md could be busy; this just starts the unlink */ + + for (;;) { + rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1, + timeout_ms, &event, &which); + + /* I expect overflow... */ + LASSERT (rc >= 0 || rc == -EOVERFLOW); + + if (rc == 0) { + /* timed out: provide a diagnostic */ + CWARN("Still waiting for ping MD to unlink\n"); + timeout_ms *= 2; + continue; + } + + /* Got a valid event */ + if (event.unlinked) + break; + } + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc == 0); + lnet_destroy_ping_info(); + cfs_restore_sigs(blocked); +} + +int +lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + lnet_md_t md = {0}; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int infosz = offsetof(lnet_ping_info_t, pi_ni[n_ids]); + lnet_ping_info_t *info; + lnet_process_id_t tmpid; + int i; + int nob; + int rc; + int rc2; + sigset_t blocked; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + /* initialize md content */ + md.start = info; + md.length = infosz; + md.threshold = 2; /*GET/REPLY*/ + md.max_size = 0; + md.options = LNET_MD_TRUNCATE; + md.user_ptr = NULL; + md.eq_handle = eqh; + + rc = LNetMDBind(md, LNET_UNLINK, &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT (rc2 == 0); + + /* NB must wait for the UNLINK event below... */ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); + + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; + } + + nob = rc; + LASSERT (nob >= 0 && nob <= infosz); + + rc = -EPROTO; /* if I can't parse... */ + + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + lnet_swap_pinginfo(info); + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if ((info->pi_features & LNET_PING_FEAT_NI_STATUS) == 0) { + CERROR("%s: ping w/o NI status: 0x%x\n", + libcfs_id2str(id), info->pi_features); + goto out_1; + } + + if (nob < offsetof(lnet_ping_info_t, pi_ni[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[0])); + goto out_1; + } + + if (info->pi_nnis < n_ids) + n_ids = info->pi_nnis; + + if (nob < offsetof(lnet_ping_info_t, pi_ni[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_ni[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... */ + + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_ni[i].ns_nid; + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; + } + rc = info->pi_nnis; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT (rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; +} diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c new file mode 100644 index 000000000000..28711e6e8b03 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/config.c @@ -0,0 +1,1264 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +typedef struct { /* tmp struct for parsing routes */ + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +} lnet_text_buf_t; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +void +lnet_syntax(char *name, char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 0 : width - 1, dashes); +} + +int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +int +lnet_net_unique(__u32 net, struct list_head *nilist) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each (tmp, nilist) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) + return 0; + } + + return 1; +} + +void +lnet_ni_free(struct lnet_ni *ni) +{ + if (ni->ni_refs != NULL) + cfs_percpt_free(ni->ni_refs); + + if (ni->ni_tx_queues != NULL) + cfs_percpt_free(ni->ni_tx_queues); + + if (ni->ni_cpts != NULL) + cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); + + LIBCFS_FREE(ni, sizeof(*ni)); +} + +lnet_ni_t * +lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) +{ + struct lnet_tx_queue *tq; + struct lnet_ni *ni; + int rc; + int i; + + if (!lnet_net_unique(net, nilist)) { + LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n", + libcfs_net2str(net)); + return NULL; + } + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net)); + return NULL; + } + + spin_lock_init(&ni->ni_lock); + INIT_LIST_HEAD(&ni->ni_cptlist); + ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_refs[0])); + if (ni->ni_refs == NULL) + goto failed; + + ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ni->ni_tx_queues[0])); + if (ni->ni_tx_queues == NULL) + goto failed; + + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) + INIT_LIST_HEAD(&tq->tq_delayed); + + if (el == NULL) { + ni->ni_cpts = NULL; + ni->ni_ncpts = LNET_CPT_NUMBER; + } else { + rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); + if (rc <= 0) { + CERROR("Failed to set CPTs for NI %s: %d\n", + libcfs_net2str(net), rc); + goto failed; + } + + LASSERT(rc <= LNET_CPT_NUMBER); + if (rc == LNET_CPT_NUMBER) { + LIBCFS_FREE(ni->ni_cpts, rc * sizeof(ni->ni_cpts[0])); + ni->ni_cpts = NULL; + } + + ni->ni_ncpts = rc; + } + + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net, 0); + ni->ni_last_alive = cfs_time_current_sec(); + list_add_tail(&ni->ni_list, nilist); + return ni; + failed: + lnet_ni_free(ni); + return NULL; +} + +int +lnet_parse_networks(struct list_head *nilist, char *networks) +{ + struct cfs_expr_list *el = NULL; + int tokensize = strlen(networks) + 1; + char *tokens; + char *str; + char *tmp; + struct lnet_ni *ni; + __u32 net; + int nnets = 0; + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR_MSG(0x112, "Can't parse networks: string too " + "long\n"); + return -EINVAL; + } + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + the_lnet.ln_network_tokens = tokens; + the_lnet.ln_network_tokens_nob = tokensize; + memcpy (tokens, networks, tokensize); + str = tmp = tokens; + + /* Add in the loopback network */ + ni = lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, nilist); + if (ni == NULL) + goto failed; + + while (str != NULL && *str != 0) { + char *comma = strchr(str, ','); + char *bracket = strchr(str, '('); + char *square = strchr(str, '['); + char *iface; + int niface; + int rc; + + /* NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) */ + + if (square != NULL && (comma == NULL || square < comma)) { + /* i.e: o2ib0(ib0)[1,2], number between square + * brackets are CPTs this NI needs to be bond */ + if (bracket != NULL && bracket > square) { + tmp = square; + goto failed_syntax; + } + + tmp = strchr(square, ']'); + if (tmp == NULL) { + tmp = square; + goto failed_syntax; + } + + rc = cfs_expr_list_parse(square, tmp - square + 1, + 0, LNET_CPT_NUMBER - 1, &el); + if (rc != 0) { + tmp = square; + goto failed_syntax; + } + + while (square <= tmp) + *square++ = ' '; + } + + if (bracket == NULL || + (comma != NULL && comma < bracket)) { + + /* no interface list specified */ + + if (comma != NULL) + *comma++ = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + + if (net == LNET_NIDNET(LNET_NID_ANY)) { + LCONSOLE_ERROR_MSG(0x113, "Unrecognised network" + " type\n"); + tmp = str; + goto failed_syntax; + } + + if (LNET_NETTYP(net) != LOLND && /* LO is implicit */ + lnet_ni_alloc(net, el, nilist) == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + str = comma; + continue; + } + + *bracket = 0; + net = libcfs_str2net(cfs_trimwhite(str)); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + tmp = str; + goto failed_syntax; + } + + nnets++; + ni = lnet_ni_alloc(net, el, nilist); + if (ni == NULL) + goto failed; + + if (el != NULL) { + cfs_expr_list_free(el); + el = NULL; + } + + niface = 0; + iface = bracket + 1; + + bracket = strchr(iface, ')'); + if (bracket == NULL) { + tmp = iface; + goto failed_syntax; + } + + *bracket = 0; + do { + comma = strchr(iface, ','); + if (comma != NULL) + *comma++ = 0; + + iface = cfs_trimwhite(iface); + if (*iface == 0) { + tmp = iface; + goto failed_syntax; + } + + if (niface == LNET_MAX_INTERFACES) { + LCONSOLE_ERROR_MSG(0x115, "Too many interfaces " + "for net %s\n", + libcfs_net2str(net)); + goto failed; + } + + ni->ni_interfaces[niface++] = iface; + iface = comma; + } while (iface != NULL); + + str = bracket + 1; + comma = strchr(bracket + 1, ','); + if (comma != NULL) { + *comma = 0; + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + str = comma + 1; + continue; + } + + str = cfs_trimwhite(str); + if (*str != 0) { + tmp = str; + goto failed_syntax; + } + } + + LASSERT(!list_empty(nilist)); + return 0; + + failed_syntax: + lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); + failed: + while (!list_empty(nilist)) { + ni = list_entry(nilist->next, lnet_ni_t, ni_list); + + list_del(&ni->ni_list); + lnet_ni_free(ni); + } + + if (el != NULL) + cfs_expr_list_free(el); + + LIBCFS_FREE(tokens, tokensize); + the_lnet.ln_network_tokens = NULL; + + return -EINVAL; +} + +lnet_text_buf_t * +lnet_new_text_buf (int str_len) +{ + lnet_text_buf_t *ltb; + int nob; + + /* NB allocate space for the terminating 0 */ + nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]); + if (nob > LNET_SINGLE_TEXTBUF_NOB) { + /* _way_ conservative for "route net gateway..." */ + CERROR("text buffer too big\n"); + return NULL; + } + + if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { + CERROR("Too many text buffers\n"); + return NULL; + } + + LIBCFS_ALLOC(ltb, nob); + if (ltb == NULL) + return NULL; + + ltb->ltb_size = nob; + ltb->ltb_text[0] = 0; + lnet_tbnob += nob; + return ltb; +} + +void +lnet_free_text_buf (lnet_text_buf_t *ltb) +{ + lnet_tbnob -= ltb->ltb_size; + LIBCFS_FREE(ltb, ltb->ltb_size); +} + +void +lnet_free_text_bufs(struct list_head *tbs) +{ + lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list); + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } +} + +void +lnet_print_text_bufs(struct list_head *tbs) +{ + struct list_head *tmp; + lnet_text_buf_t *ltb; + + list_for_each (tmp, tbs) { + ltb = list_entry(tmp, lnet_text_buf_t, ltb_list); + + CDEBUG(D_WARNING, "%s\n", ltb->ltb_text); + } + + CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob); +} + +int +lnet_str2tbs_sep (struct list_head *tbs, char *str) +{ + struct list_head pending; + char *sep; + int nob; + int i; + lnet_text_buf_t *ltb; + + INIT_LIST_HEAD(&pending); + + /* Split 'str' into separate commands */ + for (;;) { + /* skip leading whitespace */ + while (cfs_iswhite(*str)) + str++; + + /* scan for separator or comment */ + for (sep = str; *sep != 0; sep++) + if (lnet_issep(*sep) || *sep == '#') + break; + + nob = (int)(sep - str); + if (nob > 0) { + ltb = lnet_new_text_buf(nob); + if (ltb == NULL) { + lnet_free_text_bufs(&pending); + return -1; + } + + for (i = 0; i < nob; i++) + if (cfs_iswhite(str[i])) + ltb->ltb_text[i] = ' '; + else + ltb->ltb_text[i] = str[i]; + + ltb->ltb_text[nob] = 0; + + list_add_tail(<b->ltb_list, &pending); + } + + if (*sep == '#') { + /* scan for separator */ + do { + sep++; + } while (*sep != 0 && !lnet_issep(*sep)); + } + + if (*sep == 0) + break; + + str = sep + 1; + } + + list_splice(&pending, tbs->prev); + return 0; +} + +int +lnet_expand1tb (struct list_head *list, + char *str, char *sep1, char *sep2, + char *item, int itemlen) +{ + int len1 = (int)(sep1 - str); + int len2 = strlen(sep2 + 1); + lnet_text_buf_t *ltb; + + LASSERT (*sep1 == '['); + LASSERT (*sep2 == ']'); + + ltb = lnet_new_text_buf(len1 + itemlen + len2); + if (ltb == NULL) + return -ENOMEM; + + memcpy(ltb->ltb_text, str, len1); + memcpy(<b->ltb_text[len1], item, itemlen); + memcpy(<b->ltb_text[len1+itemlen], sep2 + 1, len2); + ltb->ltb_text[len1 + itemlen + len2] = 0; + + list_add_tail(<b->ltb_list, list); + return 0; +} + +int +lnet_str2tbs_expand (struct list_head *tbs, char *str) +{ + char num[16]; + struct list_head pending; + char *sep; + char *sep2; + char *parsed; + char *enditem; + int lo; + int hi; + int stride; + int i; + int nob; + int scanned; + + INIT_LIST_HEAD(&pending); + + sep = strchr(str, '['); + if (sep == NULL) /* nothing to expand */ + return 0; + + sep2 = strchr(sep, ']'); + if (sep2 == NULL) + goto failed; + + for (parsed = sep; parsed < sep2; parsed = enditem) { + + enditem = ++parsed; + while (enditem < sep2 && *enditem != ',') + enditem++; + + if (enditem == parsed) /* no empty items */ + goto failed; + + if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) { + + if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { + + /* simple string enumeration */ + if (lnet_expand1tb(&pending, str, sep, sep2, + parsed, (int)(enditem - parsed)) != 0) + goto failed; + + continue; + } + + stride = 1; + } + + /* range expansion */ + + if (enditem != parsed + scanned) /* no trailing junk */ + goto failed; + + if (hi < 0 || lo < 0 || stride < 0 || hi < lo || + (hi - lo) % stride != 0) + goto failed; + + for (i = lo; i <= hi; i += stride) { + + snprintf(num, sizeof(num), "%d", i); + nob = strlen(num); + if (nob + 1 == sizeof(num)) + goto failed; + + if (lnet_expand1tb(&pending, str, sep, sep2, + num, nob) != 0) + goto failed; + } + } + + list_splice(&pending, tbs->prev); + return 1; + + failed: + lnet_free_text_bufs(&pending); + return -1; +} + +int +lnet_parse_hops (char *str, unsigned int *hops) +{ + int len = strlen(str); + int nob = len; + + return (sscanf(str, "%u%n", hops, &nob) >= 1 && + nob == len && + *hops > 0 && *hops < 256); +} + + +int +lnet_parse_route (char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + lnet_text_buf_t *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + unsigned int hops; + int got_hops = 0; + + INIT_LIST_HEAD(&gateways); + INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd) - 1); + cmd[sizeof(cmd) - 1] = 0; + + sep = str; + for (;;) { + /* scan for token start */ + while (cfs_iswhite(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !cfs_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = <b->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! */ + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + if (!got_hops) + hops = 1; + + LASSERT (!list_empty(&nets)); + LASSERT (!list_empty(&gateways)); + + list_for_each (tmp1, &nets) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT (net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each (tmp2, &gateways) { + ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT (nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route (net, hops, nid); + if (rc != 0) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + + token_error: + lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); + out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(<b->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes (char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +int +lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) +{ + LIST_HEAD (list); + int rc; + int i; + + rc = cfs_ip_addr_parse(token, len, &list); + if (rc != 0) + return rc; + + for (rc = i = 0; !rc && i < nip; i++) + rc = cfs_ip_addr_match(ipaddrs[i], &list); + + cfs_ip_addr_free(&list); + + return rc; +} + +int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT (strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (cfs_iswhite(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !cfs_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, len, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + (int)(token - tokens), len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +__u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + lnet_text_buf_t *tb; + lnet_text_buf_t *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT (!list_empty(nets)); + LASSERT (nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, lnet_text_buf_t, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (int)(bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, lnet_text_buf_t, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += (int)(sep - tb->ltb_text); + tb2 = lnet_new_text_buf(strlen(sep)); + if (tb2 == NULL) + return -ENOMEM; + + strcpy(tb2->ltb_text, sep); + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +} + +int +lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) +{ + static char networks[LNET_SINGLE_TEXTBUF_NOB]; + static char source[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head raw_entries; + struct list_head matched_nets; + struct list_head current_nets; + struct list_head *t; + struct list_head *t2; + lnet_text_buf_t *tb; + lnet_text_buf_t *tb2; + __u32 net1; + __u32 net2; + int len; + int count; + int dup; + int rc; + + INIT_LIST_HEAD(&raw_entries); + if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { + CERROR("Error parsing ip2nets\n"); + LASSERT (lnet_tbnob == 0); + return -EINVAL; + } + + INIT_LIST_HEAD(&matched_nets); + INIT_LIST_HEAD(¤t_nets); + networks[0] = 0; + count = 0; + len = 0; + rc = 0; + + while (!list_empty(&raw_entries)) { + tb = list_entry(raw_entries.next, lnet_text_buf_t, + ltb_list); + + strncpy(source, tb->ltb_text, sizeof(source)-1); + source[sizeof(source)-1] = 0; + + /* replace ltb_text with the network(s) add on match */ + rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); + if (rc < 0) + break; + + list_del(&tb->ltb_list); + + if (rc == 0) { /* no match */ + lnet_free_text_buf(tb); + continue; + } + + /* split into separate networks */ + INIT_LIST_HEAD(¤t_nets); + list_add(&tb->ltb_list, ¤t_nets); + rc = lnet_splitnets(source, ¤t_nets); + if (rc < 0) + break; + + dup = 0; + list_for_each (t, ¤t_nets) { + tb = list_entry(t, lnet_text_buf_t, ltb_list); + net1 = lnet_netspec2net(tb->ltb_text); + LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each(t2, &matched_nets) { + tb2 = list_entry(t2, lnet_text_buf_t, + ltb_list); + net2 = lnet_netspec2net(tb2->ltb_text); + LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY)); + + if (net1 == net2) { + dup = 1; + break; + } + } + + if (dup) + break; + } + + if (dup) { + lnet_free_text_bufs(¤t_nets); + continue; + } + + list_for_each_safe(t, t2, ¤t_nets) { + tb = list_entry(t, lnet_text_buf_t, ltb_list); + + list_del(&tb->ltb_list); + list_add_tail(&tb->ltb_list, &matched_nets); + + len += snprintf(networks + len, sizeof(networks) - len, + "%s%s", (len == 0) ? "" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT (lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +int +lnet_ipaddr_enumerate (__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = libcfs_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + libcfs_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = libcfs_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + libcfs_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets (char **networksp, char *ip2nets) +{ + __u32 *ipaddrs; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR_MSG(0x117, "Error %d enumerating local IP " + "interfaces for ip2nets to match\n", nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR_MSG(0x118, "No local IP interfaces " + "for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR_MSG(0x11a, "ip2nets does not match " + "any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} + +int +lnet_set_ip_niaddr (lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + for (i = 0; i < n; i++) { + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return -ENOENT; +} +EXPORT_SYMBOL(lnet_set_ip_niaddr); diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c new file mode 100644 index 000000000000..78297a7d94e8 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -0,0 +1,447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-eq.c + * + * Library level Event queue management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +/** + * Create an event queue that has room for \a count number of events. + * + * The event queue is circular and older events will be overwritten by new + * ones if they are not removed in time by the user using the functions + * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to + * determine the appropriate size of the event queue to prevent this loss + * of events. Note that when EQ handler is specified in \a callback, no + * event loss can happen, since the handler is run for each event deposited + * into the EQ. + * + * \param count The number of events to be stored in the event queue. It + * will be rounded up to the next power of two. + * \param callback A handler function that runs when an event is deposited + * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to + * indicate that no event handler is desired. + * \param handle On successful return, this location will hold a handle for + * the newly created EQ. + * + * \retval 0 On success. + * \retval -EINVAL If an parameter is not valid. + * \retval -ENOMEM If memory for the EQ can't be allocated. + * + * \see lnet_eq_handler_t for the discussion on EQ handler semantics. + */ +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + lnet_handle_eq_t *handle) +{ + lnet_eq_t *eq; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq + * overflow, they don't skip entries, so the queue has the same + * apparent capacity at all times */ + + count = cfs_power2_roundup(count); + + if (callback != LNET_EQ_HANDLER_NONE && count != 0) { + CWARN("EQ callback is guaranteed to get every event, " + "do you still want to set eqcount %d for polling " + "event which will have locking overhead? " + "Please contact with developer to confirm\n", count); + } + + /* count can be 0 if only need callback, we can eliminate + * overhead of enqueue event */ + if (count == 0 && callback == LNET_EQ_HANDLER_NONE) + return -EINVAL; + + eq = lnet_eq_alloc(); + if (eq == NULL) + return -ENOMEM; + + if (count != 0) { + LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); + if (eq->eq_events == NULL) + goto failed; + /* NB allocator has set all event sequence numbers to 0, + * so all them should be earlier than eq_deq_seq */ + } + + eq->eq_deq_seq = 1; + eq->eq_enq_seq = 1; + eq->eq_size = count; + eq->eq_callback = callback; + + eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*eq->eq_refs[0])); + if (eq->eq_refs == NULL) + goto failed; + + /* MUST hold both exclusive lnet_res_lock */ + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); + list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); + + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_eq2handle(handle, eq); + return 0; + +failed: + if (eq->eq_events != NULL) + LIBCFS_FREE(eq->eq_events, count * sizeof(lnet_event_t)); + + if (eq->eq_refs != NULL) + cfs_percpt_free(eq->eq_refs); + + lnet_eq_free(eq); + return -ENOMEM; +} +EXPORT_SYMBOL(LNetEQAlloc); + +/** + * Release the resources associated with an event queue if it's idle; + * otherwise do nothing and it's up to the user to try again. + * + * \param eqh A handle for the event queue to be released. + * + * \retval 0 If the EQ is not in use and freed. + * \retval -ENOENT If \a eqh does not point to a valid EQ. + * \retval -EBUSY If the EQ is still in use by some MDs. + */ +int +LNetEQFree(lnet_handle_eq_t eqh) +{ + struct lnet_eq *eq; + lnet_event_t *events = NULL; + int **refs = NULL; + int *ref; + int rc = 0; + int size = 0; + int i; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + lnet_res_lock(LNET_LOCK_EX); + /* NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do + * both EQ lookup and poll event with only lnet_eq_wait_lock */ + lnet_eq_wait_lock(); + + eq = lnet_handle2eq(&eqh); + if (eq == NULL) { + rc = -ENOENT; + goto out; + } + + cfs_percpt_for_each(ref, i, eq->eq_refs) { + LASSERT(*ref >= 0); + if (*ref == 0) + continue; + + CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", + i, *ref); + rc = -EBUSY; + goto out; + } + + /* stash for free after lock dropped */ + events = eq->eq_events; + size = eq->eq_size; + refs = eq->eq_refs; + + lnet_res_lh_invalidate(&eq->eq_lh); + list_del(&eq->eq_list); + lnet_eq_free_locked(eq); + out: + lnet_eq_wait_unlock(); + lnet_res_unlock(LNET_LOCK_EX); + + if (events != NULL) + LIBCFS_FREE(events, size * sizeof(lnet_event_t)); + if (refs != NULL) + cfs_percpt_free(refs); + + return rc; +} +EXPORT_SYMBOL(LNetEQFree); + +void +lnet_eq_enqueue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ + int index; + + if (eq->eq_size == 0) { + LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); + eq->eq_callback(ev); + return; + } + + lnet_eq_wait_lock(); + ev->sequence = eq->eq_enq_seq++; + + LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); + index = ev->sequence & (eq->eq_size - 1); + + eq->eq_events[index] = *ev; + + if (eq->eq_callback != LNET_EQ_HANDLER_NONE) + eq->eq_callback(ev); + + /* Wake anyone waiting in LNetEQPoll() */ + if (waitqueue_active(&the_lnet.ln_eq_waitq)) + wake_up_all(&the_lnet.ln_eq_waitq); + lnet_eq_wait_unlock(); +} + +int +lnet_eq_dequeue_event(lnet_eq_t *eq, lnet_event_t *ev) +{ + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + lnet_event_t *new_event = &eq->eq_events[new_index]; + int rc; + ENTRY; + + /* must called with lnet_eq_wait_lock hold */ + if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) + RETURN(0); + + /* We've got a new event... */ + *ev = *new_event; + + CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->eq_deq_seq, eq->eq_size); + + /* ...but did it overwrite an event we've not seen yet? */ + if (eq->eq_deq_seq == new_event->sequence) { + rc = 1; + } else { + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", + eq->eq_deq_seq, new_event->sequence); + rc = -EOVERFLOW; + } + + eq->eq_deq_seq = new_event->sequence + 1; + RETURN(rc); +} + +/** + * A nonblocking function that can be used to get the next event in an EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. The event is removed from the queue. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 0 No pending event in the EQ. + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} +EXPORT_SYMBOL(LNetEQGet); + +/** + * Block the calling process until there is an event in the EQ. + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully. This function returns the next event + * in the EQ and removes it from the EQ. + * + * \param eventq A handle for the event queue. + * \param event On successful return (1 or -EOVERFLOW), this location will + * hold the next event in the EQ. + * + * \retval 1 Indicates success. + * \retval -ENOENT If \a eventq does not point to a valid EQ. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ has been dropped due to limited space in the EQ. + */ +int +LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, + event, &which); +} +EXPORT_SYMBOL(LNetEQWait); + + +static int +lnet_eq_wait_locked(int *timeout_ms) +{ + int tms = *timeout_ms; + int wait; + wait_queue_t wl; + cfs_time_t now; + + if (tms == 0) + return -1; /* don't want to wait and no new event */ + + init_waitqueue_entry_current(&wl); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + lnet_eq_wait_unlock(); + + if (tms < 0) { + waitq_wait(&wl, TASK_INTERRUPTIBLE); + + } else { + struct timeval tv; + + now = cfs_time_current(); + waitq_timedwait(&wl, TASK_INTERRUPTIBLE, + cfs_time_seconds(tms) / 1000); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); + tms -= (int)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + if (tms < 0) /* no more wait but may have new event */ + tms = 0; + } + + wait = tms != 0; /* might need to call here again */ + *timeout_ms = tms; + + lnet_eq_wait_lock(); + remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); + + return wait; +} + + + +/** + * Block the calling process until there's an event from a set of EQs or + * timeout happens. + * + * If an event handler is associated with the EQ, the handler will run before + * this function returns successfully, in which case the corresponding event + * is consumed. + * + * LNetEQPoll() provides a timeout to allow applications to poll, block for a + * fixed period, or block indefinitely. + * + * \param eventqs,neq An array of EQ handles, and size of the array. + * \param timeout_ms Time in milliseconds to wait for an event to occur on + * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an + * infinite timeout. + * \param event,which On successful return (1 or -EOVERFLOW), \a event will + * hold the next event in the EQs, and \a which will contain the index of the + * EQ from which the event was taken. + * + * \retval 0 No pending event in the EQs after timeout. + * \retval 1 Indicates success. + * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that + * at least one event between this event and the last event obtained from the + * EQ indicated by \a which has been dropped due to limited space in the EQ. + * \retval -ENOENT If there's an invalid handle in \a eventqs. + */ +int +LNetEQPoll(lnet_handle_eq_t *eventqs, int neq, int timeout_ms, + lnet_event_t *event, int *which) +{ + int wait = 1; + int rc; + int i; + ENTRY; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (neq < 1) + RETURN(-ENOENT); + + lnet_eq_wait_lock(); + + for (;;) { + for (i = 0; i < neq; i++) { + lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + lnet_eq_wait_unlock(); + RETURN(-ENOENT); + } + + rc = lnet_eq_dequeue_event(eq, event); + if (rc != 0) { + lnet_eq_wait_unlock(); + *which = i; + RETURN(rc); + } + } + + if (wait == 0) + break; + + /* + * return value of lnet_eq_wait_locked: + * -1 : did nothing and it's sure no new event + * 1 : sleep inside and wait until new event + * 0 : don't want to wait anymore, but might have new event + * so need to call dequeue again + */ + wait = lnet_eq_wait_locked(&timeout_ms); + if (wait < 0) /* no new event */ + break; + } + + lnet_eq_wait_unlock(); + RETURN(0); +} diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c new file mode 100644 index 000000000000..ae643f26933b --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-md.c @@ -0,0 +1,451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-md.c + * + * Memory Descriptor management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* must be called with lnet_res_lock held */ +void +lnet_md_unlink(lnet_libmd_t *md) +{ + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { + /* first unlink attempt... */ + lnet_me_t *me = md->md_me; + + md->md_flags |= LNET_MD_FLAG_ZOMBIE; + + /* Disassociate from ME (if any), and unlink it if it was created + * with LNET_UNLINK */ + if (me != NULL) { + /* detach MD from portal */ + lnet_ptl_detach_md(me, md); + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); + } + + /* ensure all future handle lookups fail */ + lnet_res_lh_invalidate(&md->md_lh); + } + + if (md->md_refcount != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if (md->md_eq != NULL) { + int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + + LASSERT(*md->md_eq->eq_refs[cpt] > 0); + (*md->md_eq->eq_refs[cpt])--; + } + + LASSERT(!list_empty(&md->md_list)); + list_del_init(&md->md_list); + lnet_md_free_locked(md); +} + +static int +lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) +{ + int i; + unsigned int niov; + int total_length = 0; + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = NULL; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; + + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.iov, umd->start, + niov * sizeof (lmd->md_iov.iov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the base address on trust */ + if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return -EINVAL; + + total_length += lmd->md_iov.iov[i].iov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + + } else if ((umd->options & LNET_MD_KIOV) != 0) { + lmd->md_niov = niov = umd->length; + memcpy(lmd->md_iov.kiov, umd->start, + niov * sizeof (lmd->md_iov.kiov[0])); + + for (i = 0; i < (int)niov; i++) { + /* We take the page pointer on trust */ + if (lmd->md_iov.kiov[i].kiov_offset + + lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE ) + return -EINVAL; /* invalid length */ + + total_length += lmd->md_iov.kiov[i].kiov_len; + } + + lmd->md_length = total_length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > total_length)) // illegal max_size + return -EINVAL; + } else { /* contiguous */ + lmd->md_length = umd->length; + lmd->md_niov = niov = 1; + lmd->md_iov.iov[0].iov_base = umd->start; + lmd->md_iov.iov[0].iov_len = umd->length; + + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ + (umd->max_size < 0 || + umd->max_size > (int)umd->length)) // illegal max_size + return -EINVAL; + } + + return 0; +} + +/* must be called with resource lock held */ +static int +lnet_md_link(lnet_libmd_t *md, lnet_handle_eq_t eq_handle, int cpt) +{ + struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; + + /* NB we are passed an allocated, but inactive md. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. + */ + /* This implementation doesn't know how to create START events or + * disable END events. Best to LASSERT our caller is compliant so + * we find out quickly... */ + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) + * LASSERT (eq == NULL); + */ + if (!LNetHandleIsInvalid(eq_handle)) { + md->md_eq = lnet_handle2eq(&eq_handle); + + if (md->md_eq == NULL) + return -ENOENT; + + (*md->md_eq->eq_refs[cpt])++; + } + + lnet_res_lh_initialize(container, &md->md_lh); + + LASSERT(list_empty(&md->md_list)); + list_add(&md->md_list, &container->rec_active); + + return 0; +} + +/* must be called with lnet_res_lock held */ +void +lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); +} + +int +lnet_md_validate(lnet_md_t *umd) +{ + if (umd->start == NULL && umd->length != 0) { + CERROR("MD start pointer can not be NULL with length %u\n", + umd->length); + return -EINVAL; + } + + if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd->length > LNET_MAX_IOV) { + CERROR("Invalid option: too many fragments %u, %d max\n", + umd->length, LNET_MAX_IOV); + return -EINVAL; + } + + return 0; +} + +/** + * Create a memory descriptor and attach it to a ME + * + * \param meh A handle for a ME to associate the new MD with. + * \param umd Provides initial values for the user-visible parts of a MD. + * Other than its use for initialization, there is no linkage between this + * structure and the MD maintained by the LNet. + * \param unlink A flag to indicate whether the MD is automatically unlinked + * when it becomes inactive, either because the operation threshold drops to + * zero or because the available memory becomes less than \a umd.max_size. + * (Note that the check for unlinking a MD only occurs after the completion + * of a successful operation on the MD.) The value LNET_UNLINK enables auto + * unlinking; the value LNET_RETAIN disables it. + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(). + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a + * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by + * calling LNetInvalidateHandle() on it. + * \retval -EBUSY If the ME pointed to by \a meh is already associated with + * a MD. + */ +int +LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, + lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + LIST_HEAD (matches); + LIST_HEAD (drops); + struct lnet_me *me; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) == 0) { + CERROR("Invalid option: no MD_OP set\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + cpt = lnet_cpt_of_cookie(meh.cookie); + + lnet_res_lock(cpt); + if (rc != 0) + goto failed; + + me = lnet_handle2me(&meh); + if (me == NULL) + rc = -ENOENT; + else if (me->me_md != NULL) + rc = -EBUSY; + else + rc = lnet_md_link(md, umd.eq_handle, cpt); + + if (rc != 0) + goto failed; + + /* attach this MD to portal of ME and check if it matches any + * blocked msgs on this portal */ + lnet_ptl_attach_md(me, md, &matches, &drops); + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + + lnet_drop_delayed_msg_list(&drops, "Bad match"); + lnet_recv_delayed_msg_list(&matches); + + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDAttach); + +/** + * Create a "free floating" memory descriptor - a MD that is not associated + * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. + * + * \param umd,unlink See the discussion for LNetMDAttach(). + * \param handle On successful returns, a handle to the newly created MD is + * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), + * and LNetGet() operations. + * + * \retval 0 On success. + * \retval -EINVAL If \a umd is not valid. + * \retval -ENOMEM If new MD cannot be allocated. + * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that + * it's OK to supply a NULL \a umd.eq_handle by calling + * LNetInvalidateHandle() on it. + */ +int +LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) +{ + lnet_libmd_t *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (lnet_md_validate(&umd) != 0) + return -EINVAL; + + if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT)) != 0) { + CERROR("Invalid option: GET|PUT illegal on active MDs\n"); + return -EINVAL; + } + + md = lnet_md_alloc(&umd); + if (md == NULL) + return -ENOMEM; + + rc = lnet_md_build(md, &umd, unlink); + + cpt = lnet_res_lock_current(); + if (rc != 0) + goto failed; + + rc = lnet_md_link(md, umd.eq_handle, cpt); + if (rc != 0) + goto failed; + + lnet_md2handle(handle, md); + + lnet_res_unlock(cpt); + return 0; + + failed: + lnet_md_free_locked(md); + + lnet_res_unlock(cpt); + return rc; +} +EXPORT_SYMBOL(LNetMDBind); + +/** + * Unlink the memory descriptor from any ME it may be linked to and release + * the internal resources associated with it. + * + * This function does not free the memory region associated with the MD; + * i.e., the memory the user allocated for this MD. If the ME associated with + * this MD is not NULL and was created with auto unlink enabled, the ME is + * unlinked as well (see LNetMEAttach()). + * + * Explicitly unlinking a MD via this function call has the same behavior as + * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK + * is generated in the latter case. + * + * An unlinked event can be reported in two ways: + * - If there's no pending operations on the MD, it's unlinked immediately + * and an LNET_EVENT_UNLINK event is logged before this function returns. + * - Otherwise, the MD is only marked for deletion when this function + * returns, and the unlinked event will be piggybacked on the event of + * the completion of the last operation by setting the unlinked field of + * the event. No dedicated LNET_EVENT_UNLINK event is generated. + * + * Note that in both cases the unlinked field of the event is always set; no + * more event will happen on the MD after such an event is logged. + * + * \param mdh A handle for the MD to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a mdh does not point to a valid MD object. + */ +int +LNetMDUnlink (lnet_handle_md_t mdh) +{ + lnet_event_t ev; + lnet_libmd_t *md; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the NAL is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + + if (md->md_eq != NULL && + md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_md_unlink(md); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMDUnlink); diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c new file mode 100644 index 000000000000..0081075cabee --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-me.c @@ -0,0 +1,297 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-me.c + * + * Match Entry management routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** + * Create and attach a match entry to the match list of \a portal. The new + * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() + * can be used to attach a MD to an empty ME. + * + * \param portal The portal table index where the ME should be attached. + * \param match_id Specifies the match criteria for the process ID of + * the requester. The constants LNET_PID_ANY and LNET_NID_ANY can be + * used to wildcard either of the identifiers in the lnet_process_id_t + * structure. + * \param match_bits,ignore_bits Specify the match criteria to apply + * to the match bits in the incoming request. The ignore bits are used + * to mask out insignificant bits in the incoming match bits. The resulting + * bits are then compared to the ME's match bits to determine if the + * incoming request meets the match criteria. + * \param unlink Indicates whether the ME should be unlinked when the memory + * descriptor associated with it is unlinked (Note that the check for + * unlinking a ME only occurs when the memory descriptor is unlinked.). + * Valid values are LNET_RETAIN and LNET_UNLINK. + * \param pos Indicates whether the new ME should be prepended or + * appended to the match list. Allowed constants: LNET_INS_BEFORE, + * LNET_INS_AFTER. + * \param handle On successful returns, a handle to the newly created ME + * object is saved here. This handle can be used later in LNetMEInsert(), + * LNetMEUnlink(), or LNetMDAttach() functions. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is invalid. + * \retval -ENOMEM If new ME object cannot be allocated. + */ +int +LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_match_table *mtable; + struct lnet_me *me; + struct list_head *head; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if ((int)portal >= the_lnet.ln_nportals) + return -EINVAL; + + mtable = lnet_mt_of_attach(portal, match_id, + match_bits, ignore_bits, pos); + if (mtable == NULL) /* can't match portal type */ + return -EPERM; + + me = lnet_me_alloc(); + if (me == NULL) + return -ENOMEM; + + lnet_res_lock(mtable->mt_cpt); + + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], + &me->me_lh); + if (ignore_bits != 0) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, match_id, match_bits); + + me->me_pos = head - &mtable->mt_mhash[0]; + if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) + list_add_tail(&me->me_list, head); + else + list_add(&me->me_list, head); + + lnet_me2handle(handle, me); + + lnet_res_unlock(mtable->mt_cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEAttach); + +/** + * Create and a match entry and insert it before or after the ME pointed to by + * \a current_meh. The new ME is empty, i.e. not associated with a memory + * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. + * + * This function is identical to LNetMEAttach() except for the position + * where the new ME is inserted. + * + * \param current_meh A handle for a ME. The new ME will be inserted + * immediately before or immediately after this ME. + * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion + * for LNetMEAttach(). + * + * \retval 0 On success. + * \retval -ENOMEM If new ME object cannot be allocated. + * \retval -ENOENT If \a current_meh does not point to a valid match entry. + */ +int +LNetMEInsert(lnet_handle_me_t current_meh, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) +{ + struct lnet_me *current_me; + struct lnet_me *new_me; + struct lnet_portal *ptl; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + if (pos == LNET_INS_LOCAL) + return -EPERM; + + new_me = lnet_me_alloc(); + if (new_me == NULL) + return -ENOMEM; + + cpt = lnet_cpt_of_cookie(current_meh.cookie); + + lnet_res_lock(cpt); + + current_me = lnet_handle2me(¤t_meh); + if (current_me == NULL) { + lnet_me_free_locked(new_me); + + lnet_res_unlock(cpt); + return -ENOENT; + } + + LASSERT(current_me->me_portal < the_lnet.ln_nportals); + + ptl = the_lnet.ln_portals[current_me->me_portal]; + if (lnet_ptl_is_unique(ptl)) { + /* nosense to insertion on unique portal */ + lnet_me_free_locked(new_me); + lnet_res_unlock(cpt); + return -EPERM; + } + + new_me->me_pos = current_me->me_pos; + new_me->me_portal = current_me->me_portal; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; + + lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); + + if (pos == LNET_INS_AFTER) + list_add(&new_me->me_list, ¤t_me->me_list); + else + list_add_tail(&new_me->me_list, ¤t_me->me_list); + + lnet_me2handle(handle, new_me); + + lnet_res_unlock(cpt); + + return 0; +} +EXPORT_SYMBOL(LNetMEInsert); + +/** + * Unlink a match entry from its match list. + * + * This operation also releases any resources associated with the ME. If a + * memory descriptor is attached to the ME, then it will be unlinked as well + * and an unlink event will be generated. It is an error to use the ME handle + * after calling LNetMEUnlink(). + * + * \param meh A handle for the ME to be unlinked. + * + * \retval 0 On success. + * \retval -ENOENT If \a meh does not point to a valid ME. + * \see LNetMDUnlink() for the discussion on delivering unlink event. + */ +int +LNetMEUnlink(lnet_handle_me_t meh) +{ + lnet_me_t *me; + lnet_libmd_t *md; + lnet_event_t ev; + int cpt; + + LASSERT(the_lnet.ln_init); + LASSERT(the_lnet.ln_refcount > 0); + + cpt = lnet_cpt_of_cookie(meh.cookie); + lnet_res_lock(cpt); + + me = lnet_handle2me(&meh); + if (me == NULL) { + lnet_res_unlock(cpt); + return -ENOENT; + } + + md = me->me_md; + if (md != NULL && + md->md_eq != NULL && + md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } + + lnet_me_unlink(me); + + lnet_res_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(LNetMEUnlink); + +/* call with lnet_res_lock please */ +void +lnet_me_unlink(lnet_me_t *me) +{ + list_del(&me->me_list); + + if (me->me_md != NULL) { + lnet_libmd_t *md = me->me_md; + + /* detach MD from portal of this ME */ + lnet_ptl_detach_md(me, md); + lnet_md_unlink(md); + } + + lnet_res_lh_invalidate(&me->me_lh); + lnet_me_free_locked(me); +} + +#if 0 +static void +lib_me_dump(lnet_me_t *me) +{ + CWARN("Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", + me->me_match_bits, me->me_ignore_bits); + + CWARN("\tMD\t= %p\n", me->md); + CWARN("\tprev\t= %p\n", + list_entry(me->me_list.prev, lnet_me_t, me_list)); + CWARN("\tnext\t= %p\n", + list_entry(me->me_list.next, lnet_me_t, me_list)); +} +#endif diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c new file mode 100644 index 000000000000..49b0f1287a69 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-move.c @@ -0,0 +1,2441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-move.c + * + * Data movement routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +static int local_nid_dist_zero = 1; +CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444, + "Reserved"); + +int +lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + LASSERT (the_lnet.ln_init); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + if (threshold != 0) { + /* Adding a new entry */ + LIBCFS_ALLOC(tp, sizeof(*tp)); + if (tp == NULL) + return -ENOMEM; + + tp->tp_nid = nid; + tp->tp_threshold = threshold; + + lnet_net_lock(0); + list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); + lnet_net_unlock(0); + return 0; + } + + /* removing entries */ + INIT_LIST_HEAD(&cull); + + lnet_net_lock(0); + + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + nid == LNET_NID_ANY || /* removing all entries */ + tp->tp_nid == nid) /* matched this one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + lnet_net_unlock(0); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); + + list_del (&tp->tp_list); + LIBCFS_FREE(tp, sizeof (*tp)); + } + return 0; +} + +static int +fail_peer (lnet_nid_t nid, int outgoing) +{ + lnet_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + /* NB: use lnet_net_lock(0) to serialize operations on test peers */ + lnet_net_lock(0); + + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != LNET_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + lnet_net_unlock(0); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); + list_del (&tp->tp_list); + + LIBCFS_FREE(tp, sizeof (*tp)); + } + + return (fail); +} + +unsigned int +lnet_iov_nob (unsigned int niov, struct iovec *iov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_iov_nob); + +void +lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset, + unsigned int nsiov, struct iovec *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + + if (nob == 0) + return; + + /* skip complete frags before 'doffset' */ + LASSERT (ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); + } + + /* skip complete frags before 'soffset' */ + LASSERT (nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); + } + + do { + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy ((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); +} +EXPORT_SYMBOL(lnet_copy_iov2iov); + +int +lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT ((int)niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_iov); + + +unsigned int +lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov) +{ + unsigned int nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} +EXPORT_SYMBOL(lnet_kiov_nob); + +void +lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) +{ + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset >= diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); + } + + LASSERT (nsiov > 0); + while (soffset >= siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); + } + + do { + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. + * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + kunmap(diov->kiov_page); + if (saddr != NULL) + kunmap(siov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2kiov); + +void +lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + + do { + LASSERT (niov > 0); + LASSERT (nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_kiov2iov); + +void +lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nob) +{ + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (nkiov > 0); + while (kiovoffset >= kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + + LASSERT (niov > 0); + while (iovoffset >= iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + do { + LASSERT (nkiov > 0); + LASSERT (niov > 0); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); + + if (addr == NULL) + addr = ((char *)kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; + + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + kunmap(kiov->kiov_page); +} +EXPORT_SYMBOL(lnet_copy_iov2kiov); + +int +lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT ((int)niov <= dst_niov); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + return (niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE); + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +EXPORT_SYMBOL(lnet_extract_kiov); + +void +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + unsigned int niov = 0; + struct iovec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + LASSERT(msg->msg_offset == offset); + LASSERT(msg->msg_wanted == mlen); + + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT (msg->msg_len > 0); + LASSERT (!msg->msg_routing); + LASSERT (md != NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} + +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} + +void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +int +lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc; + + LASSERT(!msg->msg_sending); + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_rx_ready_delay); + LASSERT(ni->ni_lnd->lnd_eager_recv != NULL); + + msg->msg_rx_ready_delay = 1; + rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(msg->msg_rxpeer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT(rc < 0); /* required by my callers */ + } + + return rc; +} + +/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ +void +lnet_ni_query_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + cfs_time_t last_alive = 0; + + LASSERT(lnet_peer_aliveness_enabled(lp)); + LASSERT(ni->ni_lnd->lnd_query != NULL); + + lnet_net_unlock(lp->lp_cpt); + (ni->ni_lnd->lnd_query)(ni, lp->lp_nid, &last_alive); + lnet_net_lock(lp->lp_cpt); + + lp->lp_last_query = cfs_time_current(); + + if (last_alive != 0) /* NI has updated timestamp */ + lp->lp_last_alive = last_alive; +} + +/* NB: always called with lnet_net_lock held */ +static inline int +lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) +{ + int alive; + cfs_time_t deadline; + + LASSERT (lnet_peer_aliveness_enabled(lp)); + + /* Trust lnet_notify() if it has more recent aliveness news, but + * ignore the initial assumed death (see lnet_peers_start_down()). + */ + if (!lp->lp_alive && lp->lp_alive_count > 0 && + cfs_time_aftereq(lp->lp_timestamp, lp->lp_last_alive)) + return 0; + + deadline = cfs_time_add(lp->lp_last_alive, + cfs_time_seconds(lp->lp_ni->ni_peertimeout)); + alive = cfs_time_after(deadline, now); + + /* Update obsolete lp_alive except for routers assumed to be dead + * initially, because router checker would update aliveness in this + * case, and moreover lp_last_alive at peer creation is assumed. + */ + if (alive && !lp->lp_alive && + !(lnet_isrouter(lp) && lp->lp_alive_count == 0)) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + + return alive; +} + + +/* NB: returns 1 when alive, 0 when dead, negative when error; + * may drop the lnet_net_lock */ +int +lnet_peer_alive_locked (lnet_peer_t *lp) +{ + cfs_time_t now = cfs_time_current(); + + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; + + if (lnet_peer_is_alive(lp, now)) + return 1; + + /* Peer appears dead, but we should avoid frequent NI queries (at + * most once per lnet_queryinterval seconds). */ + if (lp->lp_last_query != 0) { + static const int lnet_queryinterval = 1; + + cfs_time_t next_query = + cfs_time_add(lp->lp_last_query, + cfs_time_seconds(lnet_queryinterval)); + + if (cfs_time_before(now, next_query)) { + if (lp->lp_alive) + CWARN("Unexpected aliveness of peer %s: " + "%d < %d (%d/%d)\n", + libcfs_nid2str(lp->lp_nid), + (int)now, (int)next_query, + lnet_queryinterval, + lp->lp_ni->ni_peertimeout); + return 0; + } + } + + /* query NI for latest aliveness news */ + lnet_ni_query_locked(lp->lp_ni, lp); + + if (lnet_peer_is_alive(lp, now)) + return 1; + + lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); + return 0; +} + +int +lnet_post_send_locked(lnet_msg_t *msg, int do_send) +{ + /* lnet_send is going to lnet_net_unlock immediately after this, + * so it sets do_send FALSE and I don't do the unlock/send/lock bit. + * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer + * appears dead, and 0 if sent or OK to send */ + struct lnet_peer *lp = msg->msg_txpeer; + struct lnet_ni *ni = lp->lp_ni; + struct lnet_tx_queue *tq; + int cpt; + + /* non-lnet_send() callers have checked before */ + LASSERT(!do_send || msg->msg_tx_delayed); + LASSERT(!msg->msg_receiving); + LASSERT(msg->msg_tx_committed); + + cpt = msg->msg_tx_cpt; + tq = ni->ni_tx_queues[cpt]; + + /* NB 'lp' is always the next hop */ + if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && + lnet_peer_alive_locked(lp) == 0) { + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; + lnet_net_unlock(cpt); + + CNETERR("Dropping message for %s: peer not alive\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -EHOSTUNREACH); + + lnet_net_lock(cpt); + return EHOSTUNREACH; + } + + if (!msg->msg_peertxcredit) { + LASSERT ((lp->lp_txcredits < 0) == + !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_txq); + return EAGAIN; + } + } + + if (!msg->msg_txcredit) { + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + msg->msg_txcredit = 1; + tq->tq_credits--; + + if (tq->tq_credits < tq->tq_credits_min) + tq->tq_credits_min = tq->tq_credits; + + if (tq->tq_credits < 0) { + msg->msg_tx_delayed = 1; + list_add_tail(&msg->msg_list, &tq->tq_delayed); + return EAGAIN; + } + } + + if (do_send) { + lnet_net_unlock(cpt); + lnet_ni_send(ni, msg); + lnet_net_lock(cpt); + } + return 0; +} + + +lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp; + int cpt; + + LASSERT(msg->msg_rx_committed); + + cpt = msg->msg_rx_cpt; + rbp = &the_lnet.ln_rtrpools[cpt][0]; + + LASSERT(msg->msg_len <= LNET_MTU); + while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) { + rbp++; + LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); + } + + return rbp; +} + +int +lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) +{ + /* lnet_parse is going to lnet_net_unlock immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. I + * return EAGAIN if msg blocked and 0 if received or OK to receive */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + /* non-lnet_parse callers only receive delayed messages */ + LASSERT(!do_recv || msg->msg_rx_delayed); + + if (!msg->msg_peerrtrcredit) { + LASSERT ((lp->lp_rtrcredits < 0) == + !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); + + if (!msg->msg_rtrcredit) { + LASSERT ((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT(msg->msg_rx_ready_delay); + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT (!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + int cpt = msg->msg_rx_cpt; + + lnet_net_unlock(cpt); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + lnet_net_lock(cpt); + } + return 0; +} + +void +lnet_return_tx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_msg_t *msg2; + + if (msg->msg_txcredit) { + struct lnet_ni *ni = txpeer->lp_ni; + struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; + + /* give back NI txcredits */ + msg->msg_txcredit = 0; + + LASSERT((tq->tq_credits < 0) == + !list_empty(&tq->tq_delayed)); + + tq->tq_credits++; + if (tq->tq_credits <= 0) { + msg2 = list_entry(tq->tq_delayed.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == + !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT (txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer == txpeer); + LASSERT(msg2->msg_tx_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } + } + + if (txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } +} + +void +lnet_return_rx_credits_locked(lnet_msg_t *msg) +{ + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT (msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT (rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == + !list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == + !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give back peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == + !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +} + +static int +lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) +{ + lnet_peer_t *p1 = r1->lr_gateway; + lnet_peer_t *p2 = r2->lr_gateway; + + if (r1->lr_hops < r2->lr_hops) + return 1; + + if (r1->lr_hops > r2->lr_hops) + return -1; + + if (p1->lp_txqnob < p2->lp_txqnob) + return 1; + + if (p1->lp_txqnob > p2->lp_txqnob) + return -1; + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return -1; + + if (r1->lr_seq - r2->lr_seq <= 0) + return 1; + + return -1; +} + +static lnet_peer_t * +lnet_find_route_locked(lnet_ni_t *ni, lnet_nid_t target, lnet_nid_t rtr_nid) +{ + lnet_remotenet_t *rnet; + lnet_route_t *rtr; + lnet_route_t *rtr_best; + lnet_route_t *rtr_last; + struct lnet_peer *lp_best; + struct lnet_peer *lp; + int rc; + + /* If @rtr_nid is not LNET_NID_ANY, return the gateway with + * rtr_nid nid, otherwise find the best gateway I can use */ + + rnet = lnet_find_net_locked(LNET_NIDNET(target)); + if (rnet == NULL) + return NULL; + + lp_best = NULL; + rtr_best = rtr_last = NULL; + list_for_each_entry(rtr, &rnet->lrn_routes, lr_list) { + lp = rtr->lr_gateway; + + if (!lp->lp_alive || /* gateway is down */ + ((lp->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0 && + rtr->lr_downis != 0)) /* NI to target is down */ + continue; + + if (ni != NULL && lp->lp_ni != ni) + continue; + + if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ + return lp; + + if (lp_best == NULL) { + rtr_best = rtr_last = rtr; + lp_best = lp; + continue; + } + + /* no protection on below fields, but it's harmless */ + if (rtr_last->lr_seq - rtr->lr_seq < 0) + rtr_last = rtr; + + rc = lnet_compare_routes(rtr, rtr_best); + if (rc < 0) + continue; + + rtr_best = rtr; + lp_best = lp; + } + + /* set sequence number on the best router to the latest sequence + 1 + * so we can round-robin all routers, it's race and inaccurate but + * harmless and functional */ + if (rtr_best != NULL) + rtr_best->lr_seq = rtr_last->lr_seq + 1; + return lp_best; +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + struct lnet_ni *src_ni; + struct lnet_ni *local_ni; + struct lnet_peer *lp; + int cpt; + int cpt2; + int rc; + + /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, + * but we might want to use pre-determined router for ACK/REPLY + * in the future */ + /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */ + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); + + msg->msg_sending = 1; + + LASSERT(!msg->msg_tx_committed); + cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); + again: + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + if (src_nid == LNET_NID_ANY) { + src_ni = NULL; + } else { + src_ni = lnet_nid2ni_locked(src_nid, cpt); + if (src_ni == NULL) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Can't send to %s: src %s is not a " + "local nid\n", libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + LASSERT (!msg->msg_routing); + } + + /* Is this for someone on a local network? */ + local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); + + if (local_ni != NULL) { + if (src_ni == NULL) { + src_ni = local_ni; + src_nid = src_ni->ni_nid; + } else if (src_ni == local_ni) { + lnet_ni_decref_locked(local_ni, cpt); + } else { + lnet_ni_decref_locked(local_ni, cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + LCONSOLE_WARN("No route to %s via from %s\n", + libcfs_nid2str(dst_nid), + libcfs_nid2str(src_nid)); + return -EINVAL; + } + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + lnet_net_unlock(cpt); + lnet_ni_send(src_ni, msg); + + lnet_net_lock(cpt); + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + return 0; + } + + rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); + /* lp has ref on src_ni; lose mine */ + lnet_ni_decref_locked(src_ni, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + LCONSOLE_WARN("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + LASSERT (lp->lp_ni == src_ni); + } else { + /* sending to a remote network */ + lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); + if (lp == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + LCONSOLE_WARN("No route to %s via %s " + "(all routers down)\n", + libcfs_id2str(msg->msg_target), + libcfs_nid2str(src_nid)); + return -EHOSTUNREACH; + } + + /* rtr_nid is LNET_NID_ANY or NID of pre-determined router, + * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't + * pre-determined router, this can happen if router table + * was changed when we release the lock */ + if (rtr_nid != lp->lp_nid) { + cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); + if (cpt2 != cpt) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni, cpt); + lnet_net_unlock(cpt); + + rtr_nid = lp->lp_nid; + cpt = cpt2; + goto again; + } + } + + CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), + lnet_msgtyp2str(msg->msg_type), msg->msg_len); + + if (src_ni == NULL) { + src_ni = lp->lp_ni; + src_nid = src_ni->ni_nid; + } else { + LASSERT (src_ni == lp->lp_ni); + lnet_ni_decref_locked(src_ni, cpt); + } + + lnet_peer_addref_locked(lp); + + LASSERT(src_nid != LNET_NID_ANY); + lnet_msg_commit(msg, cpt); + + if (!msg->msg_routing) { + /* I'm the source and now I know which NI to send on */ + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_LNET_PID; + } + + /* 'lp' is our best choice of peer */ + + LASSERT (!msg->msg_peertxcredit); + LASSERT (!msg->msg_txcredit); + LASSERT (msg->msg_txpeer == NULL); + + msg->msg_txpeer = lp; /* msg takes my ref on lp */ + + rc = lnet_post_send_locked(msg, 0); + lnet_net_unlock(cpt); + + if (rc == EHOSTUNREACH) + return -EHOSTUNREACH; + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; +} + +static void +lnet_drop_message(lnet_ni_t *ni, int cpt, void *private, unsigned int nob) +{ + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += nob; + lnet_net_unlock(cpt); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); +} + +static void +lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + + if (msg->msg_wanted != 0) + lnet_setpayloadbuffer(msg); + + lnet_build_msg_event(msg, LNET_EVENT_PUT); + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, + msg->msg_offset, msg->msg_wanted, hdr->payload_length); +} + +static int +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + struct lnet_match_info info; + int rc; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + msg->msg_rx_ready_delay = ni->ni_lnd->lnd_eager_recv == NULL; + + again: + rc = lnet_ptl_match_md(&info, msg); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + lnet_recv_put(ni, msg); + return 0; + + case LNET_MATCHMD_NONE: + if (msg->msg_rx_delayed) /* attached on delayed list */ + return 0; + + rc = lnet_ni_eager_recv(ni, msg); + if (rc == 0) + goto again; + /* fall through */ + + case LNET_MATCHMD_DROP: + CNETERR("Dropping PUT from %s portal %d match "LPU64 + " offset %d length %d: %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); + + return ENOENT; /* +ve: OK but no match */ + } +} + +static int +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) +{ + struct lnet_match_info info; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_handle_wire_t reply_wmd; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = le32_to_cpu(hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); + hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); + + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_GET; + info.mi_portal = hdr->msg.get.ptl_index; + info.mi_rlength = hdr->msg.get.sink_length; + info.mi_roffset = hdr->msg.get.src_offset; + info.mi_mbits = hdr->msg.get.match_bits; + + rc = lnet_ptl_match_md(&info, msg); + if (rc == LNET_MATCHMD_DROP) { + CNETERR("Dropping GET from %s portal %d match "LPU64 + " offset %d length %d\n", + libcfs_id2str(info.mi_id), info.mi_portal, + info.mi_mbits, info.mi_roffset, info.mi_rlength); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT(rc == LNET_MATCHMD_OK); + + lnet_build_msg_event(msg, LNET_EVENT_GET); + + reply_wmd = hdr->msg.get.return_wmd; + + lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, + msg->msg_offset, msg->msg_wanted); + + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; + + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } + + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), + libcfs_id2str(info.mi_id), rc); + + lnet_finalize(ni, msg, rc); + } + + return 0; +} + +static int +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int rlength; + int mlength; + int cpt; + + cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CNETERR("%s: Dropping REPLY from %s for %s " + "MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + LASSERT (md->md_offset == 0); + + rlength = hdr->payload_length; + mlength = MIN(rlength, (int)md->md_length); + + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CNETERR("%s: Dropping REPLY from %s length %d " + "for MD "LPX64" would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + lnet_res_unlock(cpt); + return ENOENT; /* +ve: OK but no match */ + } + + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, mlength); + + if (mlength != 0) + lnet_setpayloadbuffer(msg); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; +} + +static int +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {0}; + lnet_libmd_t *md; + int cpt; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); + + cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); + lnet_res_lock(cpt); + + /* NB handles only looked up by creator (no flips) */ + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + /* Don't moan; this is expected */ + CDEBUG(D_NET, + "%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + return ENOENT; /* +ve! */ + } + + CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + hdr->msg.ack.dst_wmd.wh_object_cookie); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_ACK); + + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; +} + +static int +lnet_parse_forward_locked(lnet_ni_t *ni, lnet_msg_t *msg) +{ + int rc = 0; + + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + if (ni->ni_lnd->lnd_eager_recv == NULL) { + msg->msg_rx_ready_delay = 1; + } else { + lnet_net_unlock(msg->msg_rx_cpt); + rc = lnet_ni_eager_recv(ni, msg); + lnet_net_lock(msg->msg_rx_cpt); + } + } + + if (rc == 0) + rc = lnet_post_routed_recv_locked(msg, 0); + return rc; +} + +char * +lnet_msgtyp2str (int type) +{ + switch (type) { + case LNET_MSG_ACK: + return ("ACK"); + case LNET_MSG_PUT: + return ("PUT"); + case LNET_MSG_GET: + return ("GET"); + case LNET_MSG_REPLY: + return ("REPLY"); + case LNET_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} +EXPORT_SYMBOL(lnet_msgtyp2str); + +void +lnet_print_hdr(lnet_hdr_t * hdr) +{ + lnet_process_id_t src = {0}; + lnet_process_id_t dst = {0}; + char *type_str = lnet_msgtyp2str (hdr->type); + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + dst.nid = hdr->dest_nid; + dst.pid = hdr->dest_pid; + + CWARN("P3 Header at %p of type %s\n", hdr, type_str); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); + + switch (hdr->type) { + default: + break; + + case LNET_MSG_PUT: + CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPU64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + CWARN(" Length %d, offset %d, hdr data "LPX64"\n", + hdr->payload_length, hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case LNET_MSG_GET: + CWARN(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPU64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CWARN(" Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case LNET_MSG_ACK: + CWARN(" dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case LNET_MSG_REPLY: + CWARN(" dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + hdr->payload_length); + } + +} + +int +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) +{ + int rc = 0; + int cpt; + int for_me; + struct lnet_msg *msg; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; + + LASSERT (!in_interrupt ()); + + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + dest_pid = le32_to_cpu(hdr->dest_pid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = (ni->ni_nid == dest_nid); + cpt = lnet_cpt_of_nid(from_nid); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; + + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } + + if (the_lnet.ln_routing && + ni->ni_last_alive != cfs_time_current_sec()) { + lnet_ni_lock(ni); + + /* NB: so far here is the only place to set NI status to "up */ + ni->ni_last_alive = cfs_time_current_sec(); + if (ni->ni_status != NULL && + ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) + ni->ni_status->ns_status = LNET_NI_STATUS_UP; + lnet_ni_unlock(ni); + } + + /* Regard a bad destination NID as a protocol error. Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR ("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR ("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR ("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR ("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } + } + + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (src_nid, 0)) /* shall we now? */ + { + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; + } + + /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + /* for building message event */ + msg->msg_from = from_nid; + if (!for_me) { + msg->msg_target.pid = dest_pid; + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + + } else { + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = dest_pid; + msg->msg_hdr.payload_length = payload_length; + } + + lnet_net_lock(cpt); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CERROR("%s, src %s: Dropping %s " + "(error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + lnet_msg_free(msg); + goto drop; + } + + lnet_msg_commit(msg, cpt); + + if (!for_me) { + rc = lnet_parse_forward_locked(ni, msg); + lnet_net_unlock(cpt); + + if (rc < 0) + goto free_drop; + if (rc == 0) { + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + } + return 0; + } + + lnet_net_unlock(cpt); + + switch (type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); + break; + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); + break; + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, rdma_req); + break; + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); + break; + default: + LASSERT(0); + rc = -EPROTO; + goto free_drop; /* prevent an unused label if !kernel */ + } + + if (rc == 0) + return 0; + + LASSERT (rc == ENOENT); + + free_drop: + LASSERT(msg->msg_md == NULL); + lnet_finalize(ni, msg, rc); + + drop: + lnet_drop_message(ni, cpt, private, payload_length); + return 0; +} +EXPORT_SYMBOL(lnet_parse); + +void +lnet_drop_delayed_msg_list(struct list_head *head, char *reason) +{ + while (!list_empty(head)) { + lnet_process_id_t id = {0}; + lnet_msg_t *msg; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_md == NULL); + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match "LPU64 + " offset %d length %d: %s\n", + libcfs_id2str(id), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxpeer->lp_ni, + msg->msg_rxpeer->lp_cpt, + msg->msg_private, msg->msg_len); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); + } +} + +void +lnet_recv_delayed_msg_list(struct list_head *head) +{ + while (!list_empty(head)) { + lnet_msg_t *msg; + lnet_process_id_t id; + + msg = list_entry(head->next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + /* md won't disappear under me, since each msg + * holds a ref on it */ + + id.nid = msg->msg_hdr.src_nid; + id.pid = msg->msg_hdr.src_pid; + + LASSERT(msg->msg_rx_delayed); + LASSERT(msg->msg_md != NULL); + LASSERT(msg->msg_rxpeer != NULL); + LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match "LPU64" offset %d length %d.\n", + libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length); + + lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); + } +} + +/** + * Initiate an asynchronous PUT operation. + * + * There are several events associated with a PUT: completion of the send on + * the initiator node (LNET_EVENT_SEND), and when the send completes + * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating + * that the operation was accepted by the target. The event LNET_EVENT_PUT is + * used at the target node to indicate the completion of incoming data + * delivery. + * + * The local events will be logged in the EQ associated with the MD pointed to + * by \a mdh handle. Using a MD without an associated EQ results in these + * events being discarded. In this case, the caller must have another + * mechanism (e.g., a higher level protocol) for determining when it is safe + * to modify the memory region associated with the MD. + * + * Note that LNet does not guarantee the order of LNET_EVENT_SEND and + * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. + * + * \param self Indicates the NID of a local interface through which to send + * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. + * \param mdh A handle for the MD that describes the memory to be sent. The MD + * must be "free floating" (See LNetMDBind()). + * \param ack Controls whether an acknowledgment is requested. + * Acknowledgments are only sent when they are requested by the initiating + * process and the target MD enables them. + * \param target A process identifier for the target process. + * \param portal The index in the \a target's portal table. + * \param match_bits The match bits to use for MD selection at the target + * process. + * \param offset The offset into the target MD (only used when the target + * MD has the LNET_MD_MANAGE_REMOTE option set). + * \param hdr_data 64 bits of user data that can be included in the message + * header. This data is written to an event queue entry at the target if an + * EQ is present on the matching MD. + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists). + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + * + * \see lnet_event_t::hdr_data and lnet_event_kind_t. + */ +int +LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + msg->msg_vmflush = !!memory_pressure_get(); + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping PUT ("LPU64":%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("Source MD also attached to portal %d\n", + md->md_me->me_portal); + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + return -ENOENT; + } + + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; + + /* NB handles only looked up by creator (no flips) */ + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + } else { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + LNET_WIRE_HANDLE_COOKIE_NONE; + } + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc != 0) { + CNETERR( "Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetPut); + +lnet_msg_t * +lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) +{ + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink + * data has been received. + * + * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when + * lnet_finalize() is called on it, so the LND must call this first */ + + struct lnet_msg *msg = lnet_msg_alloc(); + struct lnet_libmd *getmd = getmsg->msg_md; + lnet_process_id_t peer_id = getmsg->msg_target; + int cpt; + + LASSERT(!getmsg->msg_target_is_router); + LASSERT(!getmsg->msg_routing); + + cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); + lnet_res_lock(cpt); + + LASSERT (getmd->md_refcount > 0); + + if (msg == NULL) { + CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); + goto drop; + } + + if (getmd->md_threshold == 0) { + CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); + lnet_res_unlock(cpt); + goto drop; + } + + LASSERT(getmd->md_offset == 0); + + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); + + /* setup information for lnet_build_msg_event */ + msg->msg_from = peer_id.nid; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ + msg->msg_hdr.src_nid = peer_id.nid; + msg->msg_hdr.payload_length = getmd->md_length; + msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ + + lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); + lnet_res_unlock(cpt); + + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + lnet_msg_commit(msg, cpt); + lnet_net_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_REPLY); + + return msg; + + drop: + cpt = lnet_cpt_of_nid(peer_id.nid); + + lnet_net_lock(cpt); + the_lnet.ln_counters[cpt]->drop_count++; + the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; + lnet_net_unlock(cpt); + + if (msg != NULL) + lnet_msg_free(msg); + + return NULL; +} +EXPORT_SYMBOL(lnet_create_reply_msg); + +void +lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT (reply != NULL); + LASSERT (reply->msg_type == LNET_MSG_GET); + LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. */ + LASSERT (len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} +EXPORT_SYMBOL(lnet_set_reply_msg_len); + +/** + * Initiate an asynchronous GET operation. + * + * On the initiator node, an LNET_EVENT_SEND is logged when the GET request + * is sent, and an LNET_EVENT_REPLY is logged when the data returned from + * the target node in the REPLY has been written to local MD. + * + * On the target node, an LNET_EVENT_GET is logged when the GET request + * arrives and is accepted into a MD. + * + * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). + * \param mdh A handle for the MD that describes the memory into which the + * requested data will be received. The MD must be "free floating" (See LNetMDBind()). + * + * \retval 0 Success, and only in this case events will be generated + * and logged to EQ (if it exists) of the MD. + * \retval -EIO Simulated failure. + * \retval -ENOMEM Memory allocation failure. + * \retval -ENOENT Invalid MD object. + */ +int +LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset) +{ + struct lnet_msg *msg; + struct lnet_libmd *md; + int cpt; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ + { + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; + } + + msg = lnet_msg_alloc(); + if (msg == NULL) { + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + cpt = lnet_cpt_of_cookie(mdh.cookie); + lnet_res_lock(cpt); + + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0 || md->md_me != NULL) { + CERROR("Dropping GET ("LPU64":%d:%s): MD (%d) invalid\n", + match_bits, portal, libcfs_id2str(target), + md == NULL ? -1 : md->md_threshold); + if (md != NULL && md->md_me != NULL) + CERROR("REPLY MD also attached to portal %d\n", + md->md_me->me_portal); + + lnet_res_unlock(cpt); + + lnet_msg_free(msg); + + return -ENOENT; + } + + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); + + lnet_msg_attach_md(msg, md, 0, 0); + + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); + + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); + + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + lnet_res_unlock(cpt); + + lnet_build_msg_event(msg, LNET_EVENT_SEND); + + rc = lnet_send(self, msg, LNET_NID_ANY); + if (rc < 0) { + CNETERR( "Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} +EXPORT_SYMBOL(LNetGet); + +/** + * Calculate distance to node at \a dstnid. + * + * \param dstnid Target NID. + * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid + * is saved here. + * \param orderp If not NULL, order of the route to reach \a dstnid is saved + * here. + * + * \retval 0 If \a dstnid belongs to a local interface, and reserved option + * local_nid_dist_zero is set, which is the default. + * \retval positives Distance to target NID, i.e. number of hops plus one. + * \retval -EHOSTUNREACH If \a dstnid is not reachable. + */ +int +LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) +{ + struct list_head *e; + struct lnet_ni *ni; + lnet_remotenet_t *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int cpt; + __u32 order = 2; + struct list_head *rn_list; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + cpt = lnet_net_lock_current(); + + list_for_each (e, &the_lnet.ln_nis) { + ni = list_entry(e, lnet_ni_t, ni_list); + + if (ni->ni_nid == dstnid) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + lnet_net_unlock(cpt); + + return local_nid_dist_zero ? 0 : 1; + } + + if (LNET_NIDNET(ni->ni_nid) == dstnet) { + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return 1; + } + + order++; + } + + rn_list = lnet_net2rnethash(dstnet); + list_for_each(e, rn_list) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == dstnet) { + lnet_route_t *route; + lnet_route_t *shortest = NULL; + + LASSERT (!list_empty(&rnet->lrn_routes)); + + list_for_each_entry(route, &rnet->lrn_routes, + lr_list) { + if (shortest == NULL || + route->lr_hops < shortest->lr_hops) + shortest = route; + } + + LASSERT (shortest != NULL); + hops = shortest->lr_hops; + if (srcnidp != NULL) + *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; + if (orderp != NULL) + *orderp = order; + lnet_net_unlock(cpt); + return hops + 1; + } + order++; + } + + lnet_net_unlock(cpt); + return -EHOSTUNREACH; +} +EXPORT_SYMBOL(LNetDist); + +/** + * Set the number of asynchronous messages expected from a target process. + * + * This function is only meaningful for userspace callers. It's a no-op when + * called from kernel. + * + * Asynchronous messages are those that can come from a target when the + * userspace process is not waiting for IO to complete; e.g., AST callbacks + * from Lustre servers. Specifying the expected number of such messages + * allows them to be eagerly received when user process is not running in + * LNet; otherwise network errors may occur. + * + * \param id Process ID of the target process. + * \param nasync Number of asynchronous messages expected from the target. + * + * \return 0 on success, and an error code otherwise. + */ +int +LNetSetAsync(lnet_process_id_t id, int nasync) +{ + return 0; +} +EXPORT_SYMBOL(LNetSetAsync); diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c new file mode 100644 index 000000000000..8f3a50bd5f69 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-msg.c @@ -0,0 +1,650 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-msg.c + * + * Message decoding, parsing and finalizing routines + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +void +lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev) +{ + ENTRY; + + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); + EXIT; +} + +/* + * Don't need any lock, must be called after lnet_commit_md + */ +void +lnet_build_msg_event(lnet_msg_t *msg, lnet_event_kind_t ev_type) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_routing); + + ev->type = ev_type; + + if (ev_type == LNET_EVENT_SEND) { + /* event for active message */ + ev->target.nid = le64_to_cpu(hdr->dest_nid); + ev->target.pid = le32_to_cpu(hdr->dest_pid); + ev->initiator.nid = LNET_NID_ANY; + ev->initiator.pid = the_lnet.ln_pid; + ev->sender = LNET_NID_ANY; + + } else { + /* event for passive message */ + ev->target.pid = hdr->dest_pid; + ev->target.nid = hdr->dest_nid; + ev->initiator.pid = hdr->src_pid; + ev->initiator.nid = hdr->src_nid; + ev->rlength = hdr->payload_length; + ev->sender = msg->msg_from; + ev->mlength = msg->msg_wanted; + ev->offset = msg->msg_offset; + } + + switch (ev_type) { + default: + LBUG(); + + case LNET_EVENT_PUT: /* passive PUT */ + ev->pt_index = hdr->msg.put.ptl_index; + ev->match_bits = hdr->msg.put.match_bits; + ev->hdr_data = hdr->msg.put.hdr_data; + return; + + case LNET_EVENT_GET: /* passive GET */ + ev->pt_index = hdr->msg.get.ptl_index; + ev->match_bits = hdr->msg.get.match_bits; + ev->hdr_data = 0; + return; + + case LNET_EVENT_ACK: /* ACK */ + ev->match_bits = hdr->msg.ack.match_bits; + ev->mlength = hdr->msg.ack.mlength; + return; + + case LNET_EVENT_REPLY: /* REPLY */ + return; + + case LNET_EVENT_SEND: /* active message */ + if (msg->msg_type == LNET_MSG_PUT) { + ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); + ev->offset = le32_to_cpu(hdr->msg.put.offset); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->payload_length); + ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); + + } else { + LASSERT(msg->msg_type == LNET_MSG_GET); + ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); + ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); + ev->mlength = + ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); + ev->offset = le32_to_cpu(hdr->msg.get.src_offset); + ev->hdr_data = 0; + } + return; + } +} + +void +lnet_msg_commit(lnet_msg_t *msg, int cpt) +{ + struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; + lnet_counters_t *counters = the_lnet.ln_counters[cpt]; + + /* routed message can be committed for both receiving and sending */ + LASSERT(!msg->msg_tx_committed); + + if (msg->msg_sending) { + LASSERT(!msg->msg_receiving); + + msg->msg_tx_cpt = cpt; + msg->msg_tx_committed = 1; + if (msg->msg_rx_committed) { /* routed message REPLY */ + LASSERT(msg->msg_onactivelist); + return; + } + } else { + LASSERT(!msg->msg_sending); + msg->msg_rx_cpt = cpt; + msg->msg_rx_committed = 1; + } + + LASSERT(!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add(&msg->msg_activelist, &container->msc_active); + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; +} + +static void +lnet_msg_decommit_tx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(msg->msg_tx_committed); + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_tx_cpt]; + switch (ev->type) { + default: /* routed message */ + LASSERT(msg->msg_routing); + LASSERT(msg->msg_rx_committed); + LASSERT(ev->type == 0); + + counters->route_length += msg->msg_len; + counters->route_count++; + goto out; + + case LNET_EVENT_PUT: + /* should have been decommitted */ + LASSERT(!msg->msg_rx_committed); + /* overwritten while sending ACK */ + LASSERT(msg->msg_type == LNET_MSG_ACK); + msg->msg_type = LNET_MSG_PUT; /* fix type */ + break; + + case LNET_EVENT_SEND: + LASSERT(!msg->msg_rx_committed); + if (msg->msg_type == LNET_MSG_PUT) + counters->send_length += msg->msg_len; + break; + + case LNET_EVENT_GET: + LASSERT(msg->msg_rx_committed); + /* overwritten while sending reply, we should never be + * here for optimized GET */ + LASSERT(msg->msg_type == LNET_MSG_REPLY); + msg->msg_type = LNET_MSG_GET; /* fix type */ + break; + } + + counters->send_count++; + out: + lnet_return_tx_credits_locked(msg); + msg->msg_tx_committed = 0; +} + +static void +lnet_msg_decommit_rx(lnet_msg_t *msg, int status) +{ + lnet_counters_t *counters; + lnet_event_t *ev = &msg->msg_ev; + + LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ + LASSERT(msg->msg_rx_committed); + + if (status != 0) + goto out; + + counters = the_lnet.ln_counters[msg->msg_rx_cpt]; + switch (ev->type) { + default: + LASSERT(ev->type == 0); + LASSERT(msg->msg_routing); + goto out; + + case LNET_EVENT_ACK: + LASSERT(msg->msg_type == LNET_MSG_ACK); + break; + + case LNET_EVENT_GET: + /* type is "REPLY" if it's an optimized GET on passive side, + * because optimized GET will never be committed for sending, + * so message type wouldn't be changed back to "GET" by + * lnet_msg_decommit_tx(), see details in lnet_parse_get() */ + LASSERT(msg->msg_type == LNET_MSG_REPLY || + msg->msg_type == LNET_MSG_GET); + counters->send_length += msg->msg_wanted; + break; + + case LNET_EVENT_PUT: + LASSERT(msg->msg_type == LNET_MSG_PUT); + break; + + case LNET_EVENT_REPLY: + /* type is "GET" if it's an optimized GET on active side, + * see details in lnet_create_reply_msg() */ + LASSERT(msg->msg_type == LNET_MSG_GET || + msg->msg_type == LNET_MSG_REPLY); + break; + } + + counters->recv_count++; + if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) + counters->recv_length += msg->msg_wanted; + + out: + lnet_return_rx_credits_locked(msg); + msg->msg_rx_committed = 0; +} + +void +lnet_msg_decommit(lnet_msg_t *msg, int cpt, int status) +{ + int cpt2 = cpt; + + LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); + LASSERT(msg->msg_onactivelist); + + if (msg->msg_tx_committed) { /* always decommit for sending first */ + LASSERT(cpt == msg->msg_tx_cpt); + lnet_msg_decommit_tx(msg, status); + } + + if (msg->msg_rx_committed) { + /* forwarding msg committed for both receiving and sending */ + if (cpt != msg->msg_rx_cpt) { + lnet_net_unlock(cpt); + cpt2 = msg->msg_rx_cpt; + lnet_net_lock(cpt2); + } + lnet_msg_decommit_rx(msg, status); + } + + list_del(&msg->msg_activelist); + msg->msg_onactivelist = 0; + + the_lnet.ln_counters[cpt2]->msgs_alloc--; + + if (cpt2 != cpt) { + lnet_net_unlock(cpt2); + lnet_net_lock(cpt); + } +} + +void +lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, + unsigned int offset, unsigned int mlen) +{ + /* NB: @offset and @len are only useful for receiving */ + /* Here, we attach the MD on lnet_msg and mark it busy and + * decrementing its threshold. Come what may, the lnet_msg "owns" + * the MD until a call to lnet_msg_detach_md or lnet_finalize() + * signals completion. */ + LASSERT(!msg->msg_routing); + + msg->msg_md = md; + if (msg->msg_receiving) { /* commited for receiving */ + msg->msg_offset = offset; + msg->msg_wanted = mlen; + } + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT(md->md_threshold > 0); + md->md_threshold--; + } + + /* build umd in event */ + lnet_md2handle(&msg->msg_ev.md_handle, md); + lnet_md_deconstruct(md, &msg->msg_ev.md); +} + +void +lnet_msg_detach_md(lnet_msg_t *msg, int status) +{ + lnet_libmd_t *md = msg->msg_md; + int unlink; + + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT(md->md_refcount >= 0); + + unlink = lnet_md_unlinkable(md); + if (md->md_eq != NULL) { + msg->msg_ev.status = status; + msg->msg_ev.unlinked = unlink; + lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); + } + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; +} + +static int +lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) +{ + lnet_handle_wire_t ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT (msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_msg_decommit(msg, cpt, 0); + + msg->msg_ack = 0; + lnet_net_unlock(cpt); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + /* NB: we probably want to use NID of msg::msg_from as 3rd + * parameter (router NID) if it's routed message */ + rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; + + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { + /* not forwarded */ + LASSERT(!msg->msg_receiving); /* called back recv already */ + lnet_net_unlock(cpt); + + rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); + + lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. + */ + return rc; + } + + lnet_msg_decommit(msg, cpt, status); + lnet_msg_free_locked(msg); + return 0; +} + +void +lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) +{ + struct lnet_msg_container *container; + int my_slot; + int cpt; + int rc; + int i; + + LASSERT (!in_interrupt ()); + + if (msg == NULL) + return; +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? "" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + msg->msg_ev.status = status; + + if (msg->msg_md != NULL) { + cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); + + lnet_res_lock(cpt); + lnet_msg_detach_md(msg, status); + lnet_res_unlock(cpt); + } + + again: + rc = 0; + if (!msg->msg_tx_committed && !msg->msg_rx_committed) { + /* not commited to network yet */ + LASSERT(!msg->msg_onactivelist); + lnet_msg_free(msg); + return; + } + + /* + * NB: routed message can be commited for both receiving and sending, + * we should finalize in LIFO order and keep counters correct. + * (finalize sending first then finalize receiving) + */ + cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + + container = the_lnet.ln_msg_containers[cpt]; + list_add_tail(&msg->msg_list, &container->msc_finalizing); + + /* Recursion breaker. Don't complete the message here if I am (or + * enough other threads are) already completing messages */ + + my_slot = -1; + for (i = 0; i < container->msc_nfinalizers; i++) { + if (container->msc_finalizers[i] == current) + break; + + if (my_slot < 0 && container->msc_finalizers[i] == NULL) + my_slot = i; + } + + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } + + container->msc_finalizers[my_slot] = current; + + while (!list_empty(&container->msc_finalizing)) { + msg = list_entry(container->msc_finalizing.next, + lnet_msg_t, msg_list); + + list_del(&msg->msg_list); + + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; + } + + container->msc_finalizers[my_slot] = NULL; + lnet_net_unlock(cpt); + + if (rc != 0) + goto again; +} +EXPORT_SYMBOL(lnet_finalize); + +void +lnet_msg_container_cleanup(struct lnet_msg_container *container) +{ + int count = 0; + + if (container->msc_init == 0) + return; + + while (!list_empty(&container->msc_active)) { + lnet_msg_t *msg = list_entry(container->msc_active.next, + lnet_msg_t, msg_activelist); + + LASSERT(msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del(&msg->msg_activelist); + lnet_msg_free(msg); + count++; + } + + if (count > 0) + CERROR("%d active msg on exit\n", count); + + if (container->msc_finalizers != NULL) { + LIBCFS_FREE(container->msc_finalizers, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + container->msc_finalizers = NULL; + } +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_fini(&container->msc_freelist); +#endif + container->msc_init = 0; +} + +int +lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) +{ + int rc; + + container->msc_init = 1; + + INIT_LIST_HEAD(&container->msc_active); + INIT_LIST_HEAD(&container->msc_finalizing); + +#ifdef LNET_USE_LIB_FREELIST + memset(&container->msc_freelist, 0, sizeof(lnet_freelist_t)); + + rc = lnet_freelist_init(&container->msc_freelist, + LNET_FL_MAX_MSGS, sizeof(lnet_msg_t)); + if (rc != 0) { + CERROR("Failed to init freelist for message container\n"); + lnet_msg_container_cleanup(container); + return rc; + } +#else + rc = 0; +#endif + /* number of CPUs */ + container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); + + LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt, + container->msc_nfinalizers * + sizeof(*container->msc_finalizers)); + + if (container->msc_finalizers == NULL) { + CERROR("Failed to allocate message finalizers\n"); + lnet_msg_container_cleanup(container); + return -ENOMEM; + } + + return rc; +} + +void +lnet_msg_containers_destroy(void) +{ + struct lnet_msg_container *container; + int i; + + if (the_lnet.ln_msg_containers == NULL) + return; + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) + lnet_msg_container_cleanup(container); + + cfs_percpt_free(the_lnet.ln_msg_containers); + the_lnet.ln_msg_containers = NULL; +} + +int +lnet_msg_containers_create(void) +{ + struct lnet_msg_container *container; + int rc; + int i; + + the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*container)); + + if (the_lnet.ln_msg_containers == NULL) { + CERROR("Failed to allocate cpu-partition data for network\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { + rc = lnet_msg_container_setup(container, i); + if (rc != 0) { + lnet_msg_containers_destroy(); + return rc; + } + } + + return 0; +} diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c new file mode 100644 index 000000000000..9b9e7d3139b0 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lib-ptl.c @@ -0,0 +1,938 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/lib-ptl.c + * + * portal & match routines + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* NB: add /proc interfaces in upcoming patches */ +int portal_rotor = LNET_PTL_ROTOR_HASH_RT; +CFS_MODULE_PARM(portal_rotor, "i", int, 0644, + "redirect PUTs to different cpu-partitions"); + +static int +lnet_ptl_match_type(unsigned int index, lnet_process_id_t match_id, + __u64 mbits, __u64 ignore_bits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[index]; + int unique; + + unique = ignore_bits == 0 && + match_id.nid != LNET_NID_ANY && + match_id.pid != LNET_PID_ANY; + + LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); + + /* prefer to check w/o any lock */ + if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) + goto match; + + /* unset, new portal */ + lnet_ptl_lock(ptl); + /* check again with lock */ + if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { + lnet_ptl_unlock(ptl); + goto match; + } + + /* still not set */ + if (unique) + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); + else + lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); + + lnet_ptl_unlock(ptl); + + return 1; + + match: + if ((lnet_ptl_is_unique(ptl) && !unique) || + (lnet_ptl_is_wildcard(ptl) && unique)) + return 0; + return 1; +} + +static void +lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + mtable->mt_enabled = 1; + + ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; + for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { + LASSERT(ptl->ptl_mt_maps[i] != cpt); + if (ptl->ptl_mt_maps[i] < cpt) + break; + + /* swap to order */ + ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; + ptl->ptl_mt_maps[i] = cpt; + } + + ptl->ptl_mt_nmaps++; +} + +static void +lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) +{ + struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; + int i; + + /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + if (LNET_CPT_NUMBER == 1) + return; /* never disable the only match-table */ + + mtable->mt_enabled = 0; + + LASSERT(ptl->ptl_mt_nmaps > 0 && + ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); + + /* remove it from mt_maps */ + ptl->ptl_mt_nmaps--; + for (i = 0; i < ptl->ptl_mt_nmaps; i++) { + if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ + ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; + } +} + +static int +lnet_try_match_md(lnet_libmd_t *md, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + /* ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + lnet_me_t *me = md->md_me; + + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; + + /* mismatched MD op */ + if ((md->md_options & info->mi_opc) == 0) + return LNET_MATCHMD_NONE; + + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != info->mi_id.nid) + return LNET_MATCHMD_NONE; + + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != info->mi_id.pid) + return LNET_MATCHMD_NONE; + + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; + + /* Hurrah! This _is_ a match; check it out... */ + + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = info->mi_roffset; + + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT(md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } + + if (info->mi_rlength <= mlength) { /* fits in allowed space */ + mlength = info->mi_rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match "LPU64 + " length %d too big: %d left, %d allowed\n", + libcfs_id2str(info->mi_id), info->mi_mbits, + info->mi_rlength, md->md_length - offset, mlength); + + return LNET_MATCHMD_DROP; + } + + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md "LPX64" [%d] + %d\n", + (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", + info->mi_portal, libcfs_id2str(info->mi_id), mlength, + info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_msg_attach_md(msg, md, offset, mlength); + md->md_offset = offset + mlength; + + if (!lnet_md_exhausted(md)) + return LNET_MATCHMD_OK; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0) + lnet_md_unlink(md); + + return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; +} + +static struct lnet_match_table * +lnet_match2mt(struct lnet_portal *ptl, lnet_process_id_t id, __u64 mbits) +{ + if (LNET_CPT_NUMBER == 1) + return ptl->ptl_mtables[0]; /* the only one */ + + /* if it's a unique portal, return match-table hashed by NID */ + return lnet_ptl_is_unique(ptl) ? + ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL; +} + +struct lnet_match_table * +lnet_mt_of_attach(unsigned int index, lnet_process_id_t id, + __u64 mbits, __u64 ignore_bits, lnet_ins_pos_t pos) +{ + struct lnet_portal *ptl; + struct lnet_match_table *mtable; + + /* NB: called w/o lock */ + LASSERT(index < the_lnet.ln_nportals); + + if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) + return NULL; + + ptl = the_lnet.ln_portals[index]; + + mtable = lnet_match2mt(ptl, id, mbits); + if (mtable != NULL) /* unique portal or only one match-table */ + return mtable; + + /* it's a wildcard portal */ + switch (pos) { + default: + return NULL; + case LNET_INS_BEFORE: + case LNET_INS_AFTER: + /* posted by no affinity thread, always hash to specific + * match-table to avoid buffer stealing which is heavy */ + return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; + case LNET_INS_LOCAL: + /* posted by cpu-affinity thread */ + return ptl->ptl_mtables[lnet_cpt_current()]; + } +} + +static struct lnet_match_table * +lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int nmaps; + int rotor; + int routed; + int cpt; + + /* NB: called w/o lock */ + LASSERT(info->mi_portal < the_lnet.ln_nportals); + ptl = the_lnet.ln_portals[info->mi_portal]; + + LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); + + mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); + if (mtable != NULL) + return mtable; + + /* it's a wildcard portal */ + routed = LNET_NIDNET(msg->msg_hdr.src_nid) != + LNET_NIDNET(msg->msg_hdr.dest_nid); + + if (portal_rotor == LNET_PTL_ROTOR_OFF || + (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { + cpt = lnet_cpt_current(); + if (ptl->ptl_mtables[cpt]->mt_enabled) + return ptl->ptl_mtables[cpt]; + } + + rotor = ptl->ptl_rotor++; /* get round-robin factor */ + if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) + cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid); + else + cpt = rotor % LNET_CPT_NUMBER; + + if (!ptl->ptl_mtables[cpt]->mt_enabled) { + /* is there any active entry for this portal? */ + nmaps = ptl->ptl_mt_nmaps; + /* map to an active mtable to avoid heavy "stealing" */ + if (nmaps != 0) { + /* NB: there is possibility that ptl_mt_maps is being + * changed because we are not under protection of + * lnet_ptl_lock, but it shouldn't hurt anything */ + cpt = ptl->ptl_mt_maps[rotor % nmaps]; + } + } + + return ptl->ptl_mtables[cpt]; +} + +static int +lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) +{ + __u64 *bmap; + int i; + + if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + return 0; + + if (pos < 0) { /* check all bits */ + for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { + if (mtable->mt_exhausted[i] != (__u64)(-1)) + return 0; + } + return 1; + } + + LASSERT(pos <= LNET_MT_HASH_IGNORE); + /* mtable::mt_mhash[pos] is marked as exhausted or not */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + return ((*bmap) & (1ULL << pos)) != 0; +} + +static void +lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) +{ + __u64 *bmap; + + LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); + LASSERT(pos <= LNET_MT_HASH_IGNORE); + + /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ + bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; + pos &= (1 << LNET_MT_BITS_U64) - 1; + + if (!exhausted) + *bmap &= ~(1ULL << pos); + else + *bmap |= 1ULL << pos; +} + +struct list_head * +lnet_mt_match_head(struct lnet_match_table *mtable, + lnet_process_id_t id, __u64 mbits) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; + + if (lnet_ptl_is_wildcard(ptl)) { + return &mtable->mt_mhash[mbits & LNET_MT_HASH_MASK]; + } else { + unsigned long hash = mbits + id.nid + id.pid; + + LASSERT(lnet_ptl_is_unique(ptl)); + hash = cfs_hash_long(hash, LNET_MT_HASH_BITS); + return &mtable->mt_mhash[hash]; + } +} + +int +lnet_mt_match_md(struct lnet_match_table *mtable, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct list_head *head; + lnet_me_t *me; + lnet_me_t *tmp; + int exhausted = 0; + int rc; + + /* any ME with ignore bits? */ + if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) + head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; + else + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + again: + /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ + if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) + exhausted = LNET_MATCHMD_EXHAUSTED; + + list_for_each_entry_safe(me, tmp, head, me_list) { + /* ME attached but MD not attached yet */ + if (me->me_md == NULL) + continue; + + LASSERT(me == me->me_md->md_me); + + rc = lnet_try_match_md(me->me_md, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) == 0) + exhausted = 0; /* mlist is not empty */ + + if ((rc & LNET_MATCHMD_FINISH) != 0) { + /* don't return EXHAUSTED bit because we don't know + * whether the mlist is empty or not */ + return rc & ~LNET_MATCHMD_EXHAUSTED; + } + } + + if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ + lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); + if (!lnet_mt_test_exhausted(mtable, -1)) + exhausted = 0; + } + + if (exhausted == 0 && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { + head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); + goto again; /* re-check MEs w/o ignore-bits */ + } + + if (info->mi_opc == LNET_MD_OP_GET || + !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) + return LNET_MATCHMD_DROP | exhausted; + + return LNET_MATCHMD_NONE | exhausted; +} + +static int +lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) +{ + int rc; + + /* message arrived before any buffer posting on this portal, + * simply delay or drop this message */ + if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) + return 0; + + lnet_ptl_lock(ptl); + /* check it again with hold of lock */ + if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { + lnet_ptl_unlock(ptl); + return 0; + } + + if (lnet_ptl_is_lazy(ptl)) { + if (msg->msg_rx_ready_delay) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + } + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + + lnet_ptl_unlock(ptl); + return rc; +} + +static int +lnet_ptl_match_delay(struct lnet_portal *ptl, + struct lnet_match_info *info, struct lnet_msg *msg) +{ + int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ + int rc = 0; + int i; + + /* steal buffer from other CPTs, and delay it if nothing to steal, + * this function is more expensive than a regular match, but we + * don't expect it can happen a lot */ + LASSERT(lnet_ptl_is_wildcard(ptl)); + + for (i = 0; i < LNET_CPT_NUMBER; i++) { + struct lnet_match_table *mtable; + int cpt; + + cpt = (first + i) % LNET_CPT_NUMBER; + mtable = ptl->ptl_mtables[cpt]; + if (i != 0 && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) + continue; + + lnet_res_lock(cpt); + lnet_ptl_lock(ptl); + + if (i == 0) { /* the first try, attach on stealing list */ + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_stealing); + } + + if (!list_empty(&msg->msg_list)) { /* on stealing list */ + rc = lnet_mt_match_md(mtable, info, msg); + + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && + mtable->mt_enabled) + lnet_ptl_disable_mt(ptl, cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0) + list_del_init(&msg->msg_list); + + } else { + /* could be matched by lnet_ptl_attach_md() + * which is called by another thread */ + rc = msg->msg_md == NULL ? + LNET_MATCHMD_DROP : LNET_MATCHMD_OK; + } + + if (!list_empty(&msg->msg_list) && /* not matched yet */ + (i == LNET_CPT_NUMBER - 1 || /* the last CPT */ + ptl->ptl_mt_nmaps == 0 || /* no active CPT */ + (ptl->ptl_mt_nmaps == 1 && /* the only active CPT */ + ptl->ptl_mt_maps[0] == cpt))) { + /* nothing to steal, delay or drop */ + list_del_init(&msg->msg_list); + + if (lnet_ptl_is_lazy(ptl)) { + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, + &ptl->ptl_msg_delayed); + rc = LNET_MATCHMD_NONE; + } else { + rc = LNET_MATCHMD_DROP; + } + } + + lnet_ptl_unlock(ptl); + lnet_res_unlock(cpt); + + if ((rc & LNET_MATCHMD_FINISH) != 0 || msg->msg_rx_delayed) + break; + } + + return rc; +} + +int +lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) +{ + struct lnet_match_table *mtable; + struct lnet_portal *ptl; + int rc; + + CDEBUG(D_NET, "Request from %s of length %d into portal %d " + "MB="LPX64"\n", libcfs_id2str(info->mi_id), + info->mi_rlength, info->mi_portal, info->mi_mbits); + + if (info->mi_portal >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + info->mi_portal, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + ptl = the_lnet.ln_portals[info->mi_portal]; + rc = lnet_ptl_match_early(ptl, msg); + if (rc != 0) /* matched or delayed early message */ + return rc; + + mtable = lnet_mt_of_match(info, msg); + lnet_res_lock(mtable->mt_cpt); + + if (the_lnet.ln_shutdown) { + rc = LNET_MATCHMD_DROP; + goto out1; + } + + rc = lnet_mt_match_md(mtable, info, msg); + if ((rc & LNET_MATCHMD_EXHAUSTED) != 0 && mtable->mt_enabled) { + lnet_ptl_lock(ptl); + lnet_ptl_disable_mt(ptl, mtable->mt_cpt); + lnet_ptl_unlock(ptl); + } + + if ((rc & LNET_MATCHMD_FINISH) != 0) /* matched or dropping */ + goto out1; + + if (!msg->msg_rx_ready_delay) + goto out1; + + LASSERT(lnet_ptl_is_lazy(ptl)); + LASSERT(!msg->msg_rx_delayed); + + /* NB: we don't expect "delay" can happen a lot */ + if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { + lnet_ptl_lock(ptl); + + msg->msg_rx_delayed = 1; + list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(mtable->mt_cpt); + + } else { + lnet_res_unlock(mtable->mt_cpt); + rc = lnet_ptl_match_delay(ptl, info, msg); + } + + if (msg->msg_rx_delayed) { + CDEBUG(D_NET, + "Delaying %s from %s ptl %d MB "LPX64" off %d len %d\n", + info->mi_opc == LNET_MD_OP_PUT ? "PUT" : "GET", + libcfs_id2str(info->mi_id), info->mi_portal, + info->mi_mbits, info->mi_roffset, info->mi_rlength); + } + goto out0; + out1: + lnet_res_unlock(mtable->mt_cpt); + out0: + /* EXHAUSTED bit is only meaningful for internal functions */ + return rc & ~LNET_MATCHMD_EXHAUSTED; +} + +void +lnet_ptl_detach_md(lnet_me_t *me, lnet_libmd_t *md) +{ + LASSERT(me->me_md == md && md->md_me == me); + + me->me_md = NULL; + md->md_me = NULL; +} + +/* called with lnet_res_lock held */ +void +lnet_ptl_attach_md(lnet_me_t *me, lnet_libmd_t *md, + struct list_head *matches, struct list_head *drops) +{ + struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; + struct lnet_match_table *mtable; + struct list_head *head; + lnet_msg_t *tmp; + lnet_msg_t *msg; + int exhausted = 0; + int cpt; + + LASSERT(md->md_refcount == 0); /* a brand new MD */ + + me->me_md = md; + md->md_me = me; + + cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); + mtable = ptl->ptl_mtables[cpt]; + + if (list_empty(&ptl->ptl_msg_stealing) && + list_empty(&ptl->ptl_msg_delayed) && + !lnet_mt_test_exhausted(mtable, me->me_pos)) + return; + + lnet_ptl_lock(ptl); + head = &ptl->ptl_msg_stealing; + again: + list_for_each_entry_safe(msg, tmp, head, msg_list) { + struct lnet_match_info info; + lnet_hdr_t *hdr; + int rc; + + LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); + + hdr = &msg->msg_hdr; + info.mi_id.nid = hdr->src_nid; + info.mi_id.pid = hdr->src_pid; + info.mi_opc = LNET_MD_OP_PUT; + info.mi_portal = hdr->msg.put.ptl_index; + info.mi_rlength = hdr->payload_length; + info.mi_roffset = hdr->msg.put.offset; + info.mi_mbits = hdr->msg.put.match_bits; + + rc = lnet_try_match_md(md, &info, msg); + + exhausted = (rc & LNET_MATCHMD_EXHAUSTED) != 0; + if ((rc & LNET_MATCHMD_NONE) != 0) { + if (exhausted) + break; + continue; + } + + /* Hurrah! This _is_ a match */ + LASSERT((rc & LNET_MATCHMD_FINISH) != 0); + list_del_init(&msg->msg_list); + + if (head == &ptl->ptl_msg_stealing) { + if (exhausted) + break; + /* stealing thread will handle the message */ + continue; + } + + if ((rc & LNET_MATCHMD_OK) != 0) { + list_add_tail(&msg->msg_list, matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match "LPU64" offset %d length %d.\n", + libcfs_id2str(info.mi_id), + info.mi_portal, info.mi_mbits, + info.mi_roffset, info.mi_rlength); + } else { + list_add_tail(&msg->msg_list, drops); + } + + if (exhausted) + break; + } + + if (!exhausted && head == &ptl->ptl_msg_stealing) { + head = &ptl->ptl_msg_delayed; + goto again; + } + + if (lnet_ptl_is_wildcard(ptl) && !exhausted) { + lnet_mt_set_exhausted(mtable, me->me_pos, 0); + if (!mtable->mt_enabled) + lnet_ptl_enable_mt(ptl, cpt); + } + + lnet_ptl_unlock(ptl); +} + +void +lnet_ptl_cleanup(struct lnet_portal *ptl) +{ + struct lnet_match_table *mtable; + int i; + + if (ptl->ptl_mtables == NULL) /* uninitialized portal */ + return; + + LASSERT(list_empty(&ptl->ptl_msg_delayed)); + LASSERT(list_empty(&ptl->ptl_msg_stealing)); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + struct list_head *mhash; + lnet_me_t *me; + int j; + + if (mtable->mt_mhash == NULL) /* uninitialized match-table */ + continue; + + mhash = mtable->mt_mhash; + /* cleanup ME */ + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { + while (!list_empty(&mhash[j])) { + me = list_entry(mhash[j].next, + lnet_me_t, me_list); + CERROR("Active ME %p on exit\n", me); + list_del(&me->me_list); + lnet_me_free(me); + } + } + /* the extra entry is for MEs with ignore bits */ + LIBCFS_FREE(mhash, sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + } + + cfs_percpt_free(ptl->ptl_mtables); + ptl->ptl_mtables = NULL; +} + +int +lnet_ptl_setup(struct lnet_portal *ptl, int index) +{ + struct lnet_match_table *mtable; + struct list_head *mhash; + int i; + int j; + + ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct lnet_match_table)); + if (ptl->ptl_mtables == NULL) { + CERROR("Failed to create match table for portal %d\n", index); + return -ENOMEM; + } + + ptl->ptl_index = index; + INIT_LIST_HEAD(&ptl->ptl_msg_delayed); + INIT_LIST_HEAD(&ptl->ptl_msg_stealing); + spin_lock_init(&ptl->ptl_lock); + cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { + /* the extra entry is for MEs with ignore bits */ + LIBCFS_CPT_ALLOC(mhash, lnet_cpt_table(), i, + sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1)); + if (mhash == NULL) { + CERROR("Failed to create match hash for portal %d\n", + index); + goto failed; + } + + memset(&mtable->mt_exhausted[0], -1, + sizeof(mtable->mt_exhausted[0]) * + LNET_MT_EXHAUSTED_BMAP); + mtable->mt_mhash = mhash; + for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) + INIT_LIST_HEAD(&mhash[j]); + + mtable->mt_portal = index; + mtable->mt_cpt = i; + } + + return 0; + failed: + lnet_ptl_cleanup(ptl); + return -ENOMEM; +} + +void +lnet_portals_destroy(void) +{ + int i; + + if (the_lnet.ln_portals == NULL) + return; + + for (i = 0; i < the_lnet.ln_nportals; i++) + lnet_ptl_cleanup(the_lnet.ln_portals[i]); + + cfs_array_free(the_lnet.ln_portals); + the_lnet.ln_portals = NULL; +} + +int +lnet_portals_create(void) +{ + int size; + int i; + + size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); + + the_lnet.ln_nportals = MAX_PORTALS; + the_lnet.ln_portals = cfs_array_alloc(the_lnet.ln_nportals, size); + if (the_lnet.ln_portals == NULL) { + CERROR("Failed to allocate portals table\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + if (lnet_ptl_setup(the_lnet.ln_portals[i], i)) { + lnet_portals_destroy(); + return -ENOMEM; + } + } + + return 0; +} + +/** + * Turn on the lazy portal attribute. Use with caution! + * + * This portal attribute only affects incoming PUT requests to the portal, + * and is off by default. By default, if there's no matching MD for an + * incoming PUT request, it is simply dropped. With the lazy attribute on, + * such requests are queued indefinitely until either a matching MD is + * posted to the portal or the lazy attribute is turned off. + * + * It would prevent dropped requests, however it should be regarded as the + * last line of defense - i.e. users must keep a close watch on active + * buffers on a lazy portal and once it becomes too low post more buffers as + * soon as possible. This is because delayed requests usually have detrimental + * effects on underlying network connections. A few delayed requests often + * suffice to bring an underlying connection to a complete halt, due to flow + * control mechanisms. + * + * There's also a DOS attack risk. If users don't post match-all MDs on a + * lazy portal, a malicious peer can easily stop a service by sending some + * PUT requests with match bits that won't match any MD. A routed server is + * especially vulnerable since the connections to its neighbor routers are + * shared among all clients. + * + * \param portal Index of the portal to enable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetSetLazyPortal(int portal) +{ + struct lnet_portal *ptl; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + lnet_ptl_setopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + return 0; +} +EXPORT_SYMBOL(LNetSetLazyPortal); + +/** + * Turn off the lazy portal attribute. Delayed requests on the portal, + * if any, will be all dropped when this function returns. + * + * \param portal Index of the portal to disable the lazy attribute on. + * + * \retval 0 On success. + * \retval -EINVAL If \a portal is not a valid index. + */ +int +LNetClearLazyPortal(int portal) +{ + struct lnet_portal *ptl; + LIST_HEAD (zombies); + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + ptl = the_lnet.ln_portals[portal]; + + lnet_res_lock(LNET_LOCK_EX); + lnet_ptl_lock(ptl); + + if (!lnet_ptl_is_lazy(ptl)) { + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + return 0; + } + + if (the_lnet.ln_shutdown) + CWARN("Active lazy portal %d on exit\n", portal); + else + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); + + /* grab all the blocked messages atomically */ + list_splice_init(&ptl->ptl_msg_delayed, &zombies); + + lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); + + lnet_ptl_unlock(ptl); + lnet_res_unlock(LNET_LOCK_EX); + + lnet_drop_delayed_msg_list(&zombies, "Clearing lazy portal attr"); + + return 0; +} +EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c new file mode 100644 index 000000000000..670dae34107c --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/lo.c @@ -0,0 +1,120 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +int +lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + LASSERT (!lntmsg->msg_routing); + LASSERT (!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +int +lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + lnet_msg_t *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(ni, lntmsg, 0); + } + + lnet_finalize(ni, sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +void +lolnd_shutdown(lnet_ni_t *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT (lolnd_instanced); + + lolnd_instanced = 0; +} + +int +lolnd_startup (lnet_ni_t *ni) +{ + LASSERT (ni->ni_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +lnd_t the_lolnd = { + /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, + /* .lnd_refcount = */ 0, + /* .lnd_type = */ LOLND, + /* .lnd_startup = */ lolnd_startup, + /* .lnd_shutdown = */ lolnd_shutdown, + /* .lnt_ctl = */ NULL, + /* .lnd_send = */ lolnd_send, + /* .lnd_recv = */ lolnd_recv, + /* .lnd_eager_recv = */ NULL, + /* .lnd_notify = */ NULL, + /* .lnd_accept = */ NULL +}; diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c new file mode 100644 index 000000000000..c8323854580a --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/module.c @@ -0,0 +1,154 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +static int config_on_load = 0; +CFS_MODULE_PARM(config_on_load, "i", int, 0444, + "configure network at module load"); + +static struct mutex lnet_config_mutex; + +int +lnet_configure (void *arg) +{ + /* 'arg' only there so I can be passed to cfs_create_thread() */ + int rc = 0; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; + } + } + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return rc; +} + +int +lnet_unconfigure (void) +{ + int refcount; + + LNET_MUTEX_LOCK(&lnet_config_mutex); + + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); + } + + LNET_MUTEX_LOCK(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + LNET_MUTEX_UNLOCK(&the_lnet.ln_api_mutex); + + LNET_MUTEX_UNLOCK(&lnet_config_mutex); + return (refcount == 0) ? 0 : -EBUSY; +} + +int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + int rc; + + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: + return lnet_configure(NULL); + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + + default: + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, data); + LNetNIFini(); + } + return rc; + } +} + +DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); + +int +init_lnet(void) +{ + int rc; + ENTRY; + + mutex_init(&lnet_config_mutex); + + rc = LNetInit(); + if (rc != 0) { + CERROR("LNetInit: error %d\n", rc); + RETURN(rc); + } + + rc = libcfs_register_ioctl(&lnet_ioctl_handler); + LASSERT (rc == 0); + + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void) kthread_run(lnet_configure, NULL, "lnet_initd"); + } + + RETURN(0); +} + +void +fini_lnet(void) +{ + int rc; + + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); + LASSERT (rc == 0); + + LNetFini(); +} + +MODULE_AUTHOR("Peter J. Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +cfs_module(lnet, "1.0.0", init_lnet, fini_lnet); diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c new file mode 100644 index 000000000000..286977691393 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/peer.c @@ -0,0 +1,337 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/lnet/peer.c + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +int +lnet_peer_tables_create(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*ptable)); + if (the_lnet.ln_peer_tables == NULL) { + CERROR("Failed to allocate cpu-partition peer tables\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + INIT_LIST_HEAD(&ptable->pt_deathrow); + + LIBCFS_CPT_ALLOC(hash, lnet_cpt_table(), i, + LNET_PEER_HASH_SIZE * sizeof(*hash)); + if (hash == NULL) { + CERROR("Failed to create peer hash table\n"); + lnet_peer_tables_destroy(); + return -ENOMEM; + } + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + INIT_LIST_HEAD(&hash[j]); + ptable->pt_hash = hash; /* sign of initialization */ + } + + return 0; +} + +void +lnet_peer_tables_destroy(void) +{ + struct lnet_peer_table *ptable; + struct list_head *hash; + int i; + int j; + + if (the_lnet.ln_peer_tables == NULL) + return; + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + hash = ptable->pt_hash; + if (hash == NULL) /* not intialized */ + break; + + LASSERT(list_empty(&ptable->pt_deathrow)); + + ptable->pt_hash = NULL; + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) + LASSERT(list_empty(&hash[j])); + + LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + } + + cfs_percpt_free(the_lnet.ln_peer_tables); + the_lnet.ln_peer_tables = NULL; +} + +void +lnet_peer_tables_cleanup(void) +{ + struct lnet_peer_table *ptable; + int i; + int j; + + LASSERT(the_lnet.ln_shutdown); /* i.e. no new peers */ + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + lnet_net_lock(i); + + for (j = 0; j < LNET_PEER_HASH_SIZE; j++) { + struct list_head *peers = &ptable->pt_hash[j]; + + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, + lp_hashlist); + list_del_init(&lp->lp_hashlist); + /* lose hash table's ref */ + lnet_peer_decref_locked(lp); + } + } + + lnet_net_unlock(i); + } + + cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { + LIST_HEAD (deathrow); + lnet_peer_t *lp; + + lnet_net_lock(i); + + for (j = 3; ptable->pt_number != 0; j++) { + lnet_net_unlock(i); + + if ((j & (j - 1)) == 0) { + CDEBUG(D_WARNING, + "Waiting for %d peers on peer table\n", + ptable->pt_number); + } + cfs_pause(cfs_time_seconds(1) / 2); + lnet_net_lock(i); + } + list_splice_init(&ptable->pt_deathrow, &deathrow); + + lnet_net_unlock(i); + + while (!list_empty(&deathrow)) { + lp = list_entry(deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + LIBCFS_FREE(lp, sizeof(*lp)); + } + } +} + +void +lnet_destroy_peer_locked(lnet_peer_t *lp) +{ + struct lnet_peer_table *ptable; + + LASSERT(lp->lp_refcount == 0); + LASSERT(lp->lp_rtr_refcount == 0); + LASSERT(list_empty(&lp->lp_txq)); + LASSERT(list_empty(&lp->lp_hashlist)); + LASSERT(lp->lp_txqnob == 0); + + ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; + LASSERT(ptable->pt_number > 0); + ptable->pt_number--; + + lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt); + lp->lp_ni = NULL; + + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); +} + +lnet_peer_t * +lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) +{ + struct list_head *peers; + lnet_peer_t *lp; + + LASSERT(!the_lnet.ln_shutdown); + + peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; + list_for_each_entry(lp, peers, lp_hashlist) { + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) +{ + struct lnet_peer_table *ptable; + lnet_peer_t *lp = NULL; + lnet_peer_t *lp2; + int cpt2; + int rc = 0; + + *lpp = NULL; + if (the_lnet.ln_shutdown) /* it's shutting down */ + return -ESHUTDOWN; + + /* cpt can be LNET_LOCK_EX if it's called from router functions */ + cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); + + ptable = the_lnet.ln_peer_tables[cpt2]; + lp = lnet_find_peer_locked(ptable, nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + if (!list_empty(&ptable->pt_deathrow)) { + lp = list_entry(ptable->pt_deathrow.next, + lnet_peer_t, lp_hashlist); + list_del(&lp->lp_hashlist); + } + + /* + * take extra refcount in case another thread has shutdown LNet + * and destroyed locks and peer-table before I finish the allocation + */ + ptable->pt_number++; + lnet_net_unlock(cpt); + + if (lp != NULL) + memset(lp, 0, sizeof(*lp)); + else + LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp)); + + if (lp == NULL) { + rc = -ENOMEM; + lnet_net_lock(cpt); + goto out; + } + + INIT_LIST_HEAD(&lp->lp_txq); + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); + + lp->lp_notify = 0; + lp->lp_notifylnd = 0; + lp->lp_notifying = 0; + lp->lp_alive_count = 0; + lp->lp_timestamp = 0; + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ + lp->lp_last_alive = cfs_time_current(); /* assumes alive */ + lp->lp_last_query = 0; /* haven't asked NI yet */ + lp->lp_ping_timestamp = 0; + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; + lp->lp_nid = nid; + lp->lp_cpt = cpt2; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + lp->lp_rtr_refcount = 0; + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + lp2 = lnet_find_peer_locked(ptable, nid); + if (lp2 != NULL) { + *lpp = lp2; + goto out; + } + + lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2); + if (lp->lp_ni == NULL) { + rc = -EHOSTUNREACH; + goto out; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + lp->lp_rtrcredits = + lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni); + + list_add_tail(&lp->lp_hashlist, + &ptable->pt_hash[lnet_nid2peerhash(nid)]); + ptable->pt_version++; + *lpp = lp; + + return 0; +out: + if (lp != NULL) + list_add(&lp->lp_hashlist, &ptable->pt_deathrow); + ptable->pt_number--; + return rc; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + char *aliveness = "NA"; + lnet_peer_t *lp; + int rc; + int cpt; + + cpt = lnet_cpt_of_nid(nid); + lnet_net_lock(cpt); + + rc = lnet_nid2peer_locked(&lp, nid, cpt); + if (rc != 0) { + lnet_net_unlock(cpt); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + aliveness = lp->lp_alive ? "up" : "down"; + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + aliveness, lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, + lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); +} diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c new file mode 100644 index 000000000000..c5ff97aaacc3 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/router.c @@ -0,0 +1,1693 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +#if defined(LNET_ROUTER) + +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) + +static char *forwarding = ""; +CFS_MODULE_PARM(forwarding, "s", charp, 0444, + "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers; +CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444, + "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; +CFS_MODULE_PARM(small_router_buffers, "i", int, 0444, + "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; +CFS_MODULE_PARM(large_router_buffers, "i", int, 0444, + "# of large messages to buffer in the router"); +static int peer_buffer_credits = 0; +CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444, + "# router buffer credits per peer"); + +static int auto_down = 1; +CFS_MODULE_PARM(auto_down, "i", int, 0444, + "Automatically mark peers down on comms error"); + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + /* NI option overrides LNet default */ + if (ni->ni_peerrtrcredits > 0) + return ni->ni_peerrtrcredits; + if (peer_buffer_credits > 0) + return peer_buffer_credits; + + /* As an approximation, allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + return ni->ni_peertxcredits; +} + +/* forward ref's */ +static int lnet_router_checker(void *); +#else + +int +lnet_peer_buffer_credits(lnet_ni_t *ni) +{ + return 0; +} + +#endif + +static int check_routers_before_use = 0; +CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, + "Assume routers are down and ping them before use"); + +static int avoid_asym_router_failure = 1; +CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644, + "Avoid asymmetrical router failures (0 to disable)"); + +static int dead_router_check_interval = 60; +CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644, + "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 60; +CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644, + "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644, + "Seconds to wait for the reply to a router health query"); + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when) +{ + if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + lp->lp_timestamp = when; /* update timestamp */ + lp->lp_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lp_alive_count != 0 && /* got old news */ + (!lp->lp_alive) == (!alive)) { /* new date for old news */ + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + lp->lp_alive_count++; + lp->lp_alive = !(!alive); /* 1 bit! */ + lp->lp_notify = 1; + lp->lp_notifylnd |= notifylnd; + if (lp->lp_alive) + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); +} + +void +lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp) +{ + int alive; + int notifylnd; + + /* Notify only in 1 thread at any time to ensure ordered notification. + * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + if (lp->lp_notifying) + return; + + lp->lp_notifying = 1; + + while (lp->lp_notify) { + alive = lp->lp_alive; + notifylnd = lp->lp_notifylnd; + + lp->lp_notifylnd = 0; + lp->lp_notify = 0; + + if (notifylnd && ni->ni_lnd->lnd_notify != NULL) { + lnet_net_unlock(lp->lp_cpt); + + /* A new notification could happen now; I'll handle it + * when control returns to me */ + + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + + lnet_net_lock(lp->lp_cpt); + } + } + + lp->lp_notifying = 0; +} + + +static void +lnet_rtr_addref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount >= 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, + lp_rtr_list); + + if (rtr->lp_nid < lp->lp_nid) + break; + } + + list_add(&lp->lp_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(lnet_peer_t *lp) +{ + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount > 0); + + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + LASSERT(list_empty(&lp->lp_routes)); + + if (lp->lp_rcd != NULL) { + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +lnet_remotenet_t * +lnet_find_net_locked (__u32 net) +{ + lnet_remotenet_t *rnet; + struct list_head *tmp; + struct list_head *rn_list; + + LASSERT(!the_lnet.ln_shutdown); + + rn_list = lnet_net2rnethash(net); + list_for_each(tmp, rn_list) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +static void lnet_shuffle_seed(void) +{ + static int seeded = 0; + int lnd_type, seed[2]; + struct timeval tv; + lnet_ni_t *ni; + struct list_head *tmp; + + if (seeded) + return; + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + list_for_each(tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + if (lnd_type != LOLND) + seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type); + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + seeded = 1; + return; +} + +/* NB expects LNET_LOCK held */ +void +lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + + lnet_shuffle_seed(); + + list_for_each (e, &rnet->lrn_routes) { + len++; + } + + /* len+1 positions to add a new entry, also prevents division by 0 */ + offset = cfs_rand() % (len + 1); + list_for_each (e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + +int +lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) +{ + struct list_head *e; + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_ni_t *ni; + int add_route; + int rc; + + CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + + if (gateway == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || + net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND || + LNET_NIDNET(gateway) == net || + hops < 1 || hops > 255) + return (-EINVAL); + + if (lnet_islocalnet(net)) /* it's a local network */ + return 0; /* ignore the route entry */ + + /* Assume net, route, all new */ + LIBCFS_ALLOC(route, sizeof(*route)); + LIBCFS_ALLOC(rnet, sizeof(*rnet)); + if (route == NULL || rnet == NULL) { + CERROR("Out of memory creating route %s %d %s\n", + libcfs_net2str(net), hops, libcfs_nid2str(gateway)); + if (route != NULL) + LIBCFS_FREE(route, sizeof(*route)); + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + return -ENOMEM; + } + + INIT_LIST_HEAD(&rnet->lrn_routes); + rnet->lrn_net = net; + route->lr_hops = hops; + route->lr_net = net; + + lnet_net_lock(LNET_LOCK_EX); + + rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); + if (rc != 0) { + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); + + if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */ + return 0; /* ignore the route entry */ + } else { + CERROR("Error %d creating route %s %d %s\n", rc, + libcfs_net2str(net), hops, + libcfs_nid2str(gateway)); + } + return rc; + } + + LASSERT (!the_lnet.ln_shutdown); + + rnet2 = lnet_find_net_locked(net); + if (rnet2 == NULL) { + /* new network */ + list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); + rnet2 = rnet; + } + + /* Search for a duplicate route (it's a NOOP if it is) */ + add_route = 1; + list_for_each (e, &rnet2->lrn_routes) { + lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list); + + if (route2->lr_gateway == route->lr_gateway) { + add_route = 0; + break; + } + + /* our lookups must be true */ + LASSERT (route2->lr_gateway->lp_nid != gateway); + } + + if (add_route) { + lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */ + lnet_add_route_to_rnet(rnet2, route); + + ni = route->lr_gateway->lp_ni; + lnet_net_unlock(LNET_LOCK_EX); + + /* XXX Assume alive */ + if (ni->ni_lnd->lnd_notify != NULL) + (ni->ni_lnd->lnd_notify)(ni, gateway, 1); + + lnet_net_lock(LNET_LOCK_EX); + } + + /* -1 for notify or !add_route */ + lnet_peer_decref_locked(route->lr_gateway); + lnet_net_unlock(LNET_LOCK_EX); + + if (!add_route) + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + return 0; +} + +int +lnet_check_routes(void) +{ + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *route2; + struct list_head *e1; + struct list_head *e2; + int cpt; + struct list_head *rn_list; + int i; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + route2 = NULL; + list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; + + route = list_entry(e2, lnet_route_t, + lr_list); + + if (route2 == NULL) { + route2 = route; + continue; + } + + if (route->lr_gateway->lp_ni == + route2->lr_gateway->lp_ni) + continue; + + nid1 = route->lr_gateway->lp_nid; + nid2 = route2->lr_gateway->lp_nid; + net = rnet->lrn_net; + + lnet_net_unlock(cpt); + + CERROR("Routes to %s via %s and %s not " + "supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } + + lnet_net_unlock(cpt); + return 0; +} + +int +lnet_del_route(__u32 net, lnet_nid_t gw_nid) +{ + struct lnet_peer *gateway; + lnet_remotenet_t *rnet; + lnet_route_t *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + struct list_head *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); + + again: + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + gateway = route->lr_gateway; + if (!(gw_nid == LNET_NID_ANY || + gw_nid == gateway->lp_nid)) + continue; + + list_del(&route->lr_list); + list_del(&route->lr_gwlist); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(gateway); + lnet_peer_decref_locked(gateway); + + lnet_net_unlock(LNET_LOCK_EX); + + LIBCFS_FREE(route, sizeof(*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; +} + +void +lnet_destroy_routes (void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) +{ + struct list_head *e1; + struct list_head *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + int cpt; + int i; + struct list_head *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + list_for_each(e1, rn_list) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + list_for_each(e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + lnet_net_unlock(cpt); + return 0; + } + } + } + } + + lnet_net_unlock(cpt); + return -ENOENT; +} + +void +lnet_swap_pinginfo(lnet_ping_info_t *info) +{ + int i; + lnet_ni_status_t *stat; + + __swab32s(&info->pi_magic); + __swab32s(&info->pi_features); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnis); + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + stat = &info->pi_ni[i]; + __swab64s(&stat->ns_nid); + __swab32s(&stat->ns_status); + } + return; +} + +/** + * parse router-checker pinginfo, record number of down NIs for remote + * networks on that router. + */ +static void +lnet_parse_rc_info(lnet_rc_data_t *rcd) +{ + lnet_ping_info_t *info = rcd->rcd_pinginfo; + struct lnet_peer *gw = rcd->rcd_gateway; + lnet_route_t *rtr; + + if (!gw->lp_alive) + return; + + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) + lnet_swap_pinginfo(info); + + /* NB always racing with network! */ + if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CDEBUG(D_NET, "%s: Unexpected magic %08x\n", + libcfs_nid2str(gw->lp_nid), info->pi_magic); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + gw->lp_ping_feats = info->pi_features; + if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); + return; /* nothing I can understand */ + } + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + return; /* can't carry NI status info */ + + list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) { + int ptl_status = LNET_NI_STATUS_INVALID; + int down = 0; + int up = 0; + int i; + + for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { + lnet_ni_status_t *stat = &info->pi_ni[i]; + lnet_nid_t nid = stat->ns_nid; + + if (nid == LNET_NID_ANY) { + CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", + libcfs_nid2str(gw->lp_nid)); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + continue; + + if (stat->ns_status == LNET_NI_STATUS_DOWN) { + if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND) + down++; + else if (ptl_status != LNET_NI_STATUS_UP) + ptl_status = LNET_NI_STATUS_DOWN; + continue; + } + + if (stat->ns_status == LNET_NI_STATUS_UP) { + if (LNET_NIDNET(nid) == rtr->lr_net) { + up = 1; + break; + } + /* ptl NIs are considered down only when + * they're all down */ + if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND) + ptl_status = LNET_NI_STATUS_UP; + continue; + } + + CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", + libcfs_nid2str(gw->lp_nid), stat->ns_status); + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; + return; + } + + if (up) { /* ignore downed NIs if NI for dest network is up */ + rtr->lr_downis = 0; + continue; + } + rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN); + } +} + +static void +lnet_router_checker_event(lnet_event_t *event) +{ + lnet_rc_data_t *rcd = event->md.user_ptr; + struct lnet_peer *lp; + + LASSERT(rcd != NULL); + + if (event->unlinked) { + LNetInvalidateHandle(&rcd->rcd_mdh); + return; + } + + LASSERT(event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + lp = rcd->rcd_gateway; + LASSERT(lp != NULL); + + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lp_cpt); + if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } + + if (event->type == LNET_EVENT_SEND) { + lp->lp_ping_notsent = 0; + if (event->status == 0) + goto out; + } + + /* LNET_EVENT_REPLY */ + /* A successful REPLY means the router is up. If _any_ comms + * to the router fail I assume it's down (this will happen if + * we ping alive routers to try to detect router death before + * apps get burned). */ + + lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current()); + /* The router checker will wake up very shortly and do the + * actual notification. + * XXX If 'lp' stops being a router before then, it will still + * have the notification pending!!! */ + + if (avoid_asym_router_failure && event->status == 0) + lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lp_cpt); +} + +void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + for (;;) { + int cpt = lnet_net_lock_current(); + + all_known = 1; + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + lnet_net_unlock(cpt); + + if (all_known) + return; + + cfs_pause(cfs_time_seconds(1)); + } +} + +void +lnet_update_ni_status_locked(void) +{ + lnet_ni_t *ni; + long now; + int timeout; + + LASSERT(the_lnet.ln_routing); + + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); + + now = cfs_time_current_sec(); + list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + if (ni->ni_lnd->lnd_type == LOLND) + continue; + + if (now < ni->ni_last_alive + timeout) + continue; + + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } + + LASSERT(ni->ni_status != NULL); + + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } +} + +void +lnet_destroy_rc_data(lnet_rc_data_t *rcd) +{ + LASSERT(list_empty(&rcd->rcd_list)); + /* detached from network */ + LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh)); + + if (rcd->rcd_gateway != NULL) { + int cpt = rcd->rcd_gateway->lp_cpt; + + lnet_net_lock(cpt); + lnet_peer_decref_locked(rcd->rcd_gateway); + lnet_net_unlock(cpt); + } + + if (rcd->rcd_pinginfo != NULL) + LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE); + + LIBCFS_FREE(rcd, sizeof(*rcd)); +} + +lnet_rc_data_t * +lnet_create_rc_data_locked(lnet_peer_t *gateway) +{ + lnet_rc_data_t *rcd = NULL; + lnet_ping_info_t *pi; + int rc; + int i; + + lnet_net_unlock(gateway->lp_cpt); + + LIBCFS_ALLOC(rcd, sizeof(*rcd)); + if (rcd == NULL) + goto out; + + LNetInvalidateHandle(&rcd->rcd_mdh); + INIT_LIST_HEAD(&rcd->rcd_list); + + LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE); + if (pi == NULL) + goto out; + + memset(pi, 0, LNET_PINGINFO_SIZE); + for (i = 0; i < LNET_MAX_RTR_NIS; i++) { + pi->pi_ni[i].ns_nid = LNET_NID_ANY; + pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; + } + rcd->rcd_pinginfo = pi; + + LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh)); + rc = LNetMDBind((lnet_md_t){.start = pi, + .user_ptr = rcd, + .length = LNET_PINGINFO_SIZE, + .threshold = LNET_MD_THRESH_INF, + .options = LNET_MD_TRUNCATE, + .eq_handle = the_lnet.ln_rc_eqh}, + LNET_UNLINK, + &rcd->rcd_mdh); + if (rc < 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out; + } + LASSERT(rc == 0); + + lnet_net_lock(gateway->lp_cpt); + /* router table changed or someone has created rcd for this gateway */ + if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) { + lnet_net_unlock(gateway->lp_cpt); + goto out; + } + + lnet_peer_addref_locked(gateway); + rcd->rcd_gateway = gateway; + gateway->lp_rcd = rcd; + gateway->lp_ping_notsent = 0; + + return rcd; + + out: + if (rcd != NULL) { + if (!LNetHandleIsInvalid(rcd->rcd_mdh)) { + rc = LNetMDUnlink(rcd->rcd_mdh); + LASSERT(rc == 0); + } + lnet_destroy_rc_data(rcd); + } + + lnet_net_lock(gateway->lp_cpt); + return gateway->lp_rcd; +} + +static int +lnet_router_check_interval (lnet_peer_t *rtr) +{ + int secs; + + secs = rtr->lp_alive ? live_router_check_interval : + dead_router_check_interval; + if (secs < 0) + secs = 0; + + return secs; +} + +static void +lnet_ping_router_locked (lnet_peer_t *rtr) +{ + lnet_rc_data_t *rcd = NULL; + cfs_time_t now = cfs_time_current(); + int secs; + + lnet_peer_addref_locked(rtr); + + if (rtr->lp_ping_deadline != 0 && /* ping timed out? */ + cfs_time_after(now, rtr->lp_ping_deadline)) + lnet_notify_locked(rtr, 1, 0, now); + + /* Run any outstanding notifications */ + lnet_ni_notify_locked(rtr->lp_ni, rtr); + + if (!lnet_isrouter(rtr) || + the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router table changed or router checker is shutting down */ + lnet_peer_decref_locked(rtr); + return; + } + + rcd = rtr->lp_rcd != NULL ? + rtr->lp_rcd : lnet_create_rc_data_locked(rtr); + + if (rcd == NULL) + return; + + secs = lnet_router_check_interval(rtr); + + CDEBUG(D_NET, + "rtr %s %d: deadline %lu ping_notsent %d alive %d " + "alive_count %d lp_ping_timestamp %lu\n", + libcfs_nid2str(rtr->lp_nid), secs, + rtr->lp_ping_deadline, rtr->lp_ping_notsent, + rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); + + if (secs != 0 && !rtr->lp_ping_notsent && + cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp, + cfs_time_seconds(secs)))) { + int rc; + lnet_process_id_t id; + lnet_handle_md_t mdh; + + id.nid = rtr->lp_nid; + id.pid = LUSTRE_SRV_LNET_PID; + CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); + + rtr->lp_ping_notsent = 1; + rtr->lp_ping_timestamp = now; + + mdh = rcd->rcd_mdh; + + if (rtr->lp_ping_deadline == 0) { + rtr->lp_ping_deadline = + cfs_time_shift(router_ping_timeout); + } + + lnet_net_unlock(rtr->lp_cpt); + + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + lnet_net_lock(rtr->lp_cpt); + if (rc != 0) + rtr->lp_ping_notsent = 0; /* no event pending */ + } + + lnet_peer_decref_locked(rtr); + return; +} + +int +lnet_router_checker_start(void) +{ + int rc; + int eqsz; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be" + " set if 'check_routers_before_use' is set" + "\n"); + return -EINVAL; + } + + if (!the_lnet.ln_routing && + live_router_check_interval <= 0 && + dead_router_check_interval <= 0) + return 0; + + sema_init(&the_lnet.ln_rc_signal, 0); + /* EQ size doesn't matter; the callback is guaranteed to get every + * event */ + eqsz = 0; + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); + return -ENOMEM; + } + + the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; + rc = PTR_ERR(kthread_run(lnet_router_checker, + NULL, "router_checker")); + if (IS_ERR_VALUE(rc)) { + CERROR("Can't start router checker thread: %d\n", rc); + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT(rc == 0); + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + return -ENOMEM; + } + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. */ + lnet_wait_known_routerstate(); + } + + return 0; +} + +void +lnet_router_checker_stop (void) +{ + int rc; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; + + /* block until event callback signals exit */ + down(&the_lnet.ln_rc_signal); + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); + return; +} + +static void +lnet_prune_rc_data(int wait_unlink) +{ + lnet_rc_data_t *rcd; + lnet_rc_data_t *tmp; + lnet_peer_t *lp; + struct list_head head; + int i = 2; + + if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && + list_empty(&the_lnet.ln_rcd_deathrow) && + list_empty(&the_lnet.ln_rcd_zombie))) + return; + + INIT_LIST_HEAD(&head); + + lnet_net_lock(LNET_LOCK_EX); + + if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { + /* router checker is stopping, prune all */ + list_for_each_entry(lp, &the_lnet.ln_routers, + lp_rtr_list) { + if (lp->lp_rcd == NULL) + continue; + + LASSERT(list_empty(&lp->lp_rcd->rcd_list)); + list_add(&lp->lp_rcd->rcd_list, + &the_lnet.ln_rcd_deathrow); + lp->lp_rcd = NULL; + } + } + + /* unlink all RCDs on deathrow list */ + list_splice_init(&the_lnet.ln_rcd_deathrow, &head); + + if (!list_empty(&head)) { + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry(rcd, &head, rcd_list) + LNetMDUnlink(rcd->rcd_mdh); + + lnet_net_lock(LNET_LOCK_EX); + } + + list_splice_init(&head, &the_lnet.ln_rcd_zombie); + + /* release all zombie RCDs */ + while (!list_empty(&the_lnet.ln_rcd_zombie)) { + list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, + rcd_list) { + if (LNetHandleIsInvalid(rcd->rcd_mdh)) + list_move(&rcd->rcd_list, &head); + } + + wait_unlink = wait_unlink && + !list_empty(&the_lnet.ln_rcd_zombie); + + lnet_net_unlock(LNET_LOCK_EX); + + while (!list_empty(&head)) { + rcd = list_entry(head.next, + lnet_rc_data_t, rcd_list); + list_del_init(&rcd->rcd_list); + lnet_destroy_rc_data(rcd); + } + + if (!wait_unlink) + return; + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, + "Waiting for rc buffers to unlink\n"); + cfs_pause(cfs_time_seconds(1) / 4); + + lnet_net_lock(LNET_LOCK_EX); + } + + lnet_net_unlock(LNET_LOCK_EX); +} + + +#if defined(LNET_ROUTER) + +static int +lnet_router_checker(void *arg) +{ + lnet_peer_t *rtr; + struct list_head *entry; + + cfs_block_allsigs(); + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + __u64 version; + int cpt; + int cpt2; + + cpt = lnet_net_lock_current(); +rescan: + version = the_lnet.ln_routers_version; + + list_for_each(entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } + + lnet_ping_router_locked(rtr); + + /* NB dropped lock */ + if (version != the_lnet.ln_routers_version) { + /* the routers list has changed */ + goto rescan; + } + } + + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); + + lnet_net_unlock(cpt); + + lnet_prune_rc_data(0); /* don't wait for UNLINK */ + + /* Call cfs_pause() here always adds 1 to load average + * because kernel counts # active tasks as nr_running + * + nr_uninterruptible. */ + schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + } + + LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING); + + lnet_prune_rc_data(1); /* wait for UNLINK */ + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; +} + +void +lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) +{ + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + + while (--npages >= 0) + __free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + __GFP_ZERO | GFP_IOFS); + if (page == NULL) { + while (--i >= 0) + __free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +void +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; + + LASSERT (list_empty(&rbp->rbp_msgs)); + LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT (rbp->rbp_credits > 0); + + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + nbuffers++; + } + + LASSERT (rbp->rbp_nbuffers == nbuffers); + LASSERT (rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} + +int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt) +{ + lnet_rtrbuf_t *rb; + int i; + + if (rbp->rbp_nbuffers != 0) { + LASSERT (rbp->rbp_nbuffers == nbufs); + return 0; + } + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp, cpt); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + rbp->rbp_credits++; + rbp->rbp_mincredits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* No allocation "under fire" */ + /* Otherwise we'd need code to schedule blocked msgs etc */ + LASSERT (!the_lnet.ln_routing); + } + + LASSERT (rbp->rbp_credits == nbufs); + return 0; +} + +void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + INIT_LIST_HEAD(&rbp->rbp_msgs); + INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_rtrpools_free(void) +{ + lnet_rtrbufpool_t *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[0]); + lnet_rtrpool_free_bufs(&rtrp[1]); + lnet_rtrpool_free_bufs(&rtrp[2]); + } + + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; +} + +static int +lnet_nrb_tiny_calculate(int npages) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -1; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(int npages) +{ + int nrbs = LNET_NRB_SMALL; + + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -1; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); +} + +static int +lnet_nrb_large_calculate(int npages) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -1; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} + +int +lnet_rtrpools_alloc(int im_a_router) +{ + lnet_rtrbufpool_t *rtrp; + int large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int small_pages = 1; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either " + "'enabled' or 'disabled'\n"); + return -EINVAL; + } + + nrb_tiny = lnet_nrb_tiny_calculate(0); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(small_pages); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(large_pages); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(lnet_rtrbufpool_t)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[0], 0); + rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[1], small_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[2], large_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i); + if (rc != 0) + goto failed; + } + + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); + + return 0; + + failed: + lnet_rtrpools_free(); + return rc; +} + +int +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) +{ + struct lnet_peer *lp = NULL; + cfs_time_t now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN ("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (cfs_time_after(when, now)) { + CWARN ("Ignoring prediction from %s of %s %s " + "%ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + cfs_duration_sec(cfs_time_sub(when, now))); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + lnet_net_lock(cpt); + + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + /* We can't fully trust LND on reporting exact peer last_alive + * if he notifies us about dead peer. For example ksocklnd can + * call us with when == _time_when_the_node_was_booted_ if + * no connections were successfully established */ + if (ni != NULL && !alive && when < lp->lp_last_alive) + when = lp->lp_last_alive; + + lnet_notify_locked(lp, ni == NULL, alive, when); + + lnet_ni_notify_locked(ni, lp); + + lnet_peer_decref_locked(lp); + + lnet_net_unlock(cpt); + return 0; +} +EXPORT_SYMBOL(lnet_notify); + +void +lnet_get_tunables (void) +{ + return; +} + +#else + +int +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) +{ + return -EOPNOTSUPP; +} + +void +lnet_router_checker (void) +{ + static time_t last = 0; + static int running = 0; + + time_t now = cfs_time_current_sec(); + int interval = now - last; + int rc; + __u64 version; + lnet_peer_t *rtr; + + /* It's no use to call me again within a sec - all intervals and + * timeouts are measured in seconds */ + if (last != 0 && interval < 2) + return; + + if (last != 0 && + interval > MAX(live_router_check_interval, + dead_router_check_interval)) + CNETERR("Checker(%d/%d) not called for %d seconds\n", + live_router_check_interval, dead_router_check_interval, + interval); + + LASSERT(LNET_CPT_NUMBER == 1); + + lnet_net_lock(0); + LASSERT(!running); /* recursion check */ + running = 1; + lnet_net_unlock(0); + + last = now; + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) + lnet_prune_rc_data(0); /* unlink all rcd and nowait */ + + /* consume all pending events */ + while (1) { + int i; + lnet_event_t ev; + + /* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the + * recursion breaker in LNetEQPoll would fail */ + rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i); + if (rc == 0) /* no event pending */ + break; + + /* NB a lost SENT prevents me from pinging a router again */ + if (rc == -EOVERFLOW) { + CERROR("Dropped an event!!!\n"); + abort(); + } + + LASSERT (rc == 1); + + lnet_router_checker_event(&ev); + } + + if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) { + lnet_prune_rc_data(1); /* release rcd */ + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + running = 0; + return; + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + lnet_net_lock(0); + + version = the_lnet.ln_routers_version; + list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) { + lnet_ping_router_locked(rtr); + LASSERT (version == the_lnet.ln_routers_version); + } + + lnet_net_unlock(0); + + running = 0; /* lock only needed for the recursion check */ + return; +} + +/* NB lnet_peers_start_down depends on me, + * so must be called before any peer creation */ +void +lnet_get_tunables (void) +{ + char *s; + + s = getenv("LNET_ROUTER_PING_TIMEOUT"); + if (s != NULL) router_ping_timeout = atoi(s); + + s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL"); + if (s != NULL) live_router_check_interval = atoi(s); + + s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL"); + if (s != NULL) dead_router_check_interval = atoi(s); + + /* This replaces old lnd_notify mechanism */ + check_routers_before_use = 1; + if (dead_router_check_interval <= 0) + dead_router_check_interval = 30; +} + +void +lnet_rtrpools_free(void) +{ +} + +int +lnet_rtrpools_alloc(int im_a_arouter) +{ + return 0; +} + +#endif diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c new file mode 100644 index 000000000000..3084b0c75983 --- /dev/null +++ b/drivers/staging/lustre/lnet/lnet/router_proc.c @@ -0,0 +1,950 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +#if defined(LNET_ROUTER) + +/* This is really lnet_proc.c. You might need to update sanity test 215 + * if any file format is changed. */ + +static ctl_table_header_t *lnet_table_header = NULL; + +#define CTL_LNET (0x100) +enum { + PSDEV_LNET_STATS = 100, + PSDEV_LNET_ROUTES, + PSDEV_LNET_ROUTERS, + PSDEV_LNET_PEERS, + PSDEV_LNET_BUFFERS, + PSDEV_LNET_NIS, + PSDEV_LNET_PTL_ROTOR, +}; + +#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) +/* + * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system + */ +#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) +/* change version, 16 bits or 8 bits */ +#define LNET_PROC_VER_BITS MAX(((MIN(LNET_LOFFT_BITS, 64)) / 4), 8) + +#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS +/* + * bits for peer hash offset + * NB: we don't use the highest bit of *ppos because it's signed + */ +#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ + LNET_PROC_CPT_BITS - \ + LNET_PROC_VER_BITS - \ + LNET_PROC_HASH_BITS - 1) +/* bits for hash index + position */ +#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) +/* bits for peer hash table + hash version */ +#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) + +#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) +#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) +#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) +#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) + +#define LNET_PROC_CPT_GET(pos) \ + (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) + +#define LNET_PROC_VER_GET(pos) \ + (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) + +#define LNET_PROC_HASH_GET(pos) \ + (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) + +#define LNET_PROC_HOFF_GET(pos) \ + (int)((pos) & LNET_PROC_HOFF_MASK) + +#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ + (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ + ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ + ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ + ((off) & LNET_PROC_HOFF_MASK)) + +#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) + +static int __proc_lnet_stats(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + int rc; + lnet_counters_t *ctrs; + int len; + char *tmpstr; + const int tmpsiz = 256; /* 7 %u and 4 LPU64 */ + + if (write) { + lnet_counters_reset(); + return 0; + } + + /* read */ + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) { + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return -ENOMEM; + } + + lnet_counters_get(ctrs); + + len = snprintf(tmpstr, tmpsiz, + "%u %u %u %u %u %u %u "LPU64" "LPU64" " + LPU64" "LPU64, + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + + LIBCFS_FREE(tmpstr, tmpsiz); + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +DECLARE_PROC_HANDLER(proc_lnet_stats); + +int LL_PROC_PROTO(proc_lnet_routes) +{ + const int tmpsiz = 256; + char *tmpstr; + char *s; + int rc = 0; + int len; + int ver; + int off; + + DECLARE_LL_PROC_PPOS_DECL; + + CLASSERT(sizeof(loff_t) >= 4); + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + LASSERT (tmpstr + tmpsiz - s > 0); + + s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %7s %s\n", + "net", "hops", "state", "router"); + LASSERT (tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_remote_nets_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *n; + struct list_head *r; + lnet_route_t *route = NULL; + lnet_remotenet_t *rnet = NULL; + int skip = off - 1; + struct list_head *rn_list; + int i; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { + lnet_net_unlock(0); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && route == NULL; + i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + + n = rn_list->next; + + while (n != rn_list && route == NULL) { + rnet = list_entry(n, lnet_remotenet_t, + lrn_list); + + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); + if (skip == 0) { + route = re; + break; + } + + skip--; + r = r->next; + } + + n = n->next; + } + } + + if (route != NULL) { + __u32 net = rnet->lrn_net; + unsigned int hops = route->lr_hops; + lnet_nid_t nid = route->lr_gateway->lp_nid; + int alive = route->lr_gateway->lp_alive; + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-8s %4u %7s %s\n", + libcfs_net2str(net), hops, + alive ? "up" : "down", + libcfs_nid2str(nid)); + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +int LL_PROC_PROTO(proc_lnet_routers) +{ + int rc = 0; + char *tmpstr; + char *s; + const int tmpsiz = 256; + int len; + int ver; + int off; + + DECLARE_LL_PROC_PPOS_DECL; + + off = LNET_PROC_HOFF_GET(*ppos); + ver = LNET_PROC_VER_GET(*ppos); + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", + "last_ping", "ping_sent", "deadline", + "down_ni", "router"); + LASSERT(tmpstr + tmpsiz - s > 0); + + lnet_net_lock(0); + ver = (unsigned int)the_lnet.ln_routers_version; + lnet_net_unlock(0); + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } else { + struct list_head *r; + struct lnet_peer *peer = NULL; + int skip = off - 1; + + lnet_net_lock(0); + + if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { + lnet_net_unlock(0); + + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + r = the_lnet.ln_routers.next; + + while (r != &the_lnet.ln_routers) { + lnet_peer_t *lp = list_entry(r, lnet_peer_t, + lp_rtr_list); + + if (skip == 0) { + peer = lp; + break; + } + + skip--; + r = r->next; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + cfs_time_t now = cfs_time_current(); + cfs_time_t deadline = peer->lp_ping_deadline; + int nrefs = peer->lp_refcount; + int nrtrrefs = peer->lp_rtr_refcount; + int alive_cnt = peer->lp_alive_count; + int alive = peer->lp_alive; + int pingsent = !peer->lp_ping_notsent; + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lp_ping_timestamp)); + int down_ni = 0; + lnet_route_t *rtr; + + if ((peer->lp_ping_feats & + LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rtr, &peer->lp_routes, + lr_gwlist) { + /* downis on any route should be the + * number of downis on the gateway */ + if (rtr->lr_downis != 0) { + down_ni = rtr->lr_downis; + break; + } + } + } + + if (deadline == 0) + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, "NA", down_ni, + libcfs_nid2str(nid)); + else + s += snprintf(s, tmpstr + tmpsiz - s, + "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", + nrefs, nrtrrefs, alive_cnt, + alive ? "up" : "down", last_ping, + pingsent, + cfs_duration_sec(cfs_time_sub(deadline, now)), + down_ni, libcfs_nid2str(nid)); + LASSERT (tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else { + off += 1; + *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); + } + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +int LL_PROC_PROTO(proc_lnet_peers) +{ + const int tmpsiz = 256; + struct lnet_peer_table *ptable; + char *tmpstr; + char *s; + int cpt = LNET_PROC_CPT_GET(*ppos); + int ver = LNET_PROC_VER_GET(*ppos); + int hash = LNET_PROC_HASH_GET(*ppos); + int hoff = LNET_PROC_HOFF_GET(*ppos); + int rc = 0; + int len; + + CLASSERT(LNET_PROC_HASH_BITS >= LNET_PEER_HASH_BITS); + LASSERT(!write); + + if (*lenp == 0) + return 0; + + if (cpt >= LNET_CPT_NUMBER) { + *lenp = 0; + return 0; + } + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "last", "max", + "rtr", "min", "tx", "min", "queue"); + LASSERT (tmpstr + tmpsiz - s > 0); + + hoff++; + } else { + struct lnet_peer *peer; + struct list_head *p; + int skip; + again: + p = NULL; + peer = NULL; + skip = hoff - 1; + + lnet_net_lock(cpt); + ptable = the_lnet.ln_peer_tables[cpt]; + if (hoff == 1) + ver = LNET_PROC_VERSION(ptable->pt_version); + + if (ver != LNET_PROC_VERSION(ptable->pt_version)) { + lnet_net_unlock(cpt); + LIBCFS_FREE(tmpstr, tmpsiz); + return -ESTALE; + } + + while (hash < LNET_PEER_HASH_SIZE) { + if (p == NULL) + p = ptable->pt_hash[hash].next; + + while (p != &ptable->pt_hash[hash]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + if (skip == 0) { + peer = lp; + + /* minor optimization: start from idx+1 + * on next iteration if we've just + * drained lp_hashlist */ + if (lp->lp_hashlist.next == + &ptable->pt_hash[hash]) { + hoff = 1; + hash++; + } else { + hoff++; + } + + break; + } + + skip--; + p = lp->lp_hashlist.next; + } + + if (peer != NULL) + break; + + p = NULL; + hoff = 1; + hash++; + } + + if (peer != NULL) { + lnet_nid_t nid = peer->lp_nid; + int nrefs = peer->lp_refcount; + int lastalive = -1; + char *aliveness = "NA"; + int maxcr = peer->lp_ni->ni_peertxcredits; + int txcr = peer->lp_txcredits; + int mintxcr = peer->lp_mintxcredits; + int rtrcr = peer->lp_rtrcredits; + int minrtrcr = peer->lp_minrtrcredits; + int txqnob = peer->lp_txqnob; + + if (lnet_isrouter(peer) || + lnet_peer_aliveness_enabled(peer)) + aliveness = peer->lp_alive ? "up" : "down"; + + if (lnet_peer_aliveness_enabled(peer)) { + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; + + delta = cfs_time_sub(now, peer->lp_last_alive); + lastalive = cfs_duration_sec(delta); + + /* No need to mess up peers contents with + * arbitrarily long integers - it suffices to + * know that lastalive is more than 10000s old + */ + if (lastalive >= 10000) + lastalive = 9999; + } + + lnet_net_unlock(cpt); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, aliveness, + lastalive, maxcr, rtrcr, minrtrcr, txcr, + mintxcr, txqnob); + LASSERT (tmpstr + tmpsiz - s > 0); + + } else { /* peer is NULL */ + lnet_net_unlock(cpt); + } + + if (hash == LNET_PEER_HASH_SIZE) { + cpt++; + hash = 0; + hoff = 1; + if (peer == NULL && cpt < LNET_CPT_NUMBER) + goto again; + } + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +static int __proc_lnet_buffers(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + char *s; + char *tmpstr; + int tmpsiz; + int idx; + int len; + int rc; + int i; + + LASSERT(!write); + + /* (4 %d) * 4 * LNET_CPT_NUMBER */ + tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + s += snprintf(s, tmpstr + tmpsiz - s, + "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + + if (the_lnet.ln_rtrpools == NULL) + goto out; /* I'm not a router */ + + for (idx = 0; idx < LNET_NRBPOOLS; idx++) { + lnet_rtrbufpool_t *rbp; + + lnet_net_lock(LNET_LOCK_EX); + cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%5d %5d %7d %7d\n", + rbp[idx].rbp_npages, + rbp[idx].rbp_nbuffers, + rbp[idx].rbp_credits, + rbp[idx].rbp_mincredits); + LASSERT(tmpstr + tmpsiz - s > 0); + } + lnet_net_unlock(LNET_LOCK_EX); + } + + out: + len = s - tmpstr; + + if (pos >= min_t(int, len, strlen(tmpstr))) + rc = 0; + else + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, NULL); + + LIBCFS_FREE(tmpstr, tmpsiz); + return rc; +} + +DECLARE_PROC_HANDLER(proc_lnet_buffers); + +int LL_PROC_PROTO(proc_lnet_nis) +{ + int tmpsiz = 128 * LNET_CPT_NUMBER; + int rc = 0; + char *tmpstr; + char *s; + int len; + + DECLARE_LL_PROC_PPOS_DECL; + + LASSERT (!write); + + if (*lenp == 0) + return 0; + + LIBCFS_ALLOC(tmpstr, tmpsiz); + if (tmpstr == NULL) + return -ENOMEM; + + s = tmpstr; /* points to current position in tmpstr[] */ + + if (*ppos == 0) { + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", + "nid", "status", "alive", "refs", "peer", + "rtr", "max", "tx", "min"); + LASSERT (tmpstr + tmpsiz - s > 0); + } else { + struct list_head *n; + lnet_ni_t *ni = NULL; + int skip = *ppos - 1; + + lnet_net_lock(0); + + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + lnet_ni_t *a_ni = list_entry(n, lnet_ni_t, ni_list); + + if (skip == 0) { + ni = a_ni; + break; + } + + skip--; + n = n->next; + } + + if (ni != NULL) { + struct lnet_tx_queue *tq; + char *stat; + long now = cfs_time_current_sec(); + int last_alive = -1; + int i; + int j; + + if (the_lnet.ln_routing) + last_alive = now - ni->ni_last_alive; + + /* @lo forever alive */ + if (ni->ni_lnd->lnd_type == LOLND) + last_alive = 0; + + lnet_ni_lock(ni); + LASSERT(ni->ni_status != NULL); + stat = (ni->ni_status->ns_status == + LNET_NI_STATUS_UP) ? "up" : "down"; + lnet_ni_unlock(ni); + + /* we actually output credits information for + * TX queue of each partition */ + cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { + for (j = 0; ni->ni_cpts != NULL && + j < ni->ni_ncpts; j++) { + if (i == ni->ni_cpts[j]) + break; + } + + if (j == ni->ni_ncpts) + continue; + + if (i != 0) + lnet_net_lock(i); + + s += snprintf(s, tmpstr + tmpsiz - s, + "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", + libcfs_nid2str(ni->ni_nid), stat, + last_alive, *ni->ni_refs[i], + ni->ni_peertxcredits, + ni->ni_peerrtrcredits, + tq->tq_credits_max, + tq->tq_credits, tq->tq_credits_min); + if (i != 0) + lnet_net_unlock(i); + } + LASSERT(tmpstr + tmpsiz - s > 0); + } + + lnet_net_unlock(0); + } + + len = s - tmpstr; /* how many bytes was written */ + + if (len > *lenp) { /* linux-supplied buffer is too small */ + rc = -EINVAL; + } else if (len > 0) { /* wrote something */ + if (copy_to_user(buffer, tmpstr, len)) + rc = -EFAULT; + else + *ppos += 1; + } + + LIBCFS_FREE(tmpstr, tmpsiz); + + if (rc == 0) + *lenp = len; + + return rc; +} + +struct lnet_portal_rotors { + int pr_value; + const char *pr_name; + const char *pr_desc; +}; + +static struct lnet_portal_rotors portal_rotors[] = { + { + .pr_value = LNET_PTL_ROTOR_OFF, + .pr_name = "OFF", + .pr_desc = "Turn off message rotor for wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_ON, + .pr_name = "ON", + .pr_desc = "round-robin dispatch all PUT messages for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_RR_RT, + .pr_name = "RR_RT", + .pr_desc = "round-robin dispatch routed PUT message for " + "wildcard portals" + }, + { + .pr_value = LNET_PTL_ROTOR_HASH_RT, + .pr_name = "HASH_RT", + .pr_desc = "dispatch routed PUT message by hashing source " + "NID for wildcard portals" + }, + { + .pr_value = -1, + .pr_name = NULL, + .pr_desc = NULL + }, +}; + +extern int portal_rotor; + +static int __proc_lnet_portal_rotor(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + const int buf_len = 128; + char *buf; + char *tmp; + int rc; + int i; + + LIBCFS_ALLOC(buf, buf_len); + if (buf == NULL) + return -ENOMEM; + + if (!write) { + lnet_res_lock(0); + + for (i = 0; portal_rotors[i].pr_value >= 0; i++) { + if (portal_rotors[i].pr_value == portal_rotor) + break; + } + + LASSERT(portal_rotors[i].pr_value == portal_rotor); + lnet_res_unlock(0); + + rc = snprintf(buf, buf_len, + "{\n\tportals: all\n" + "\trotor: %s\n\tdescription: %s\n}", + portal_rotors[i].pr_name, + portal_rotors[i].pr_desc); + + if (pos >= min_t(int, rc, buf_len)) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + buf + pos, "\n"); + } + goto out; + } + + rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); + if (rc < 0) + goto out; + + tmp = cfs_trimwhite(buf); + + rc = -EINVAL; + lnet_res_lock(0); + for (i = 0; portal_rotors[i].pr_name != NULL; i++) { + if (cfs_strncasecmp(portal_rotors[i].pr_name, tmp, + strlen(portal_rotors[i].pr_name)) == 0) { + portal_rotor = portal_rotors[i].pr_value; + rc = 0; + break; + } + } + lnet_res_unlock(0); +out: + LIBCFS_FREE(buf, buf_len); + return rc; +} +DECLARE_PROC_HANDLER(proc_lnet_portal_rotor); + +static ctl_table_t lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME(PSDEV_LNET_STATS) + .procname = "stats", + .mode = 0644, + .proc_handler = &proc_lnet_stats, + }, + { + INIT_CTL_NAME(PSDEV_LNET_ROUTES) + .procname = "routes", + .mode = 0444, + .proc_handler = &proc_lnet_routes, + }, + { + INIT_CTL_NAME(PSDEV_LNET_ROUTERS) + .procname = "routers", + .mode = 0444, + .proc_handler = &proc_lnet_routers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PEERS) + .procname = "peers", + .mode = 0444, + .proc_handler = &proc_lnet_peers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PEERS) + .procname = "buffers", + .mode = 0444, + .proc_handler = &proc_lnet_buffers, + }, + { + INIT_CTL_NAME(PSDEV_LNET_NIS) + .procname = "nis", + .mode = 0444, + .proc_handler = &proc_lnet_nis, + }, + { + INIT_CTL_NAME(PSDEV_LNET_PTL_ROTOR) + .procname = "portal_rotor", + .mode = 0644, + .proc_handler = &proc_lnet_portal_rotor, + }, + { + INIT_CTL_NAME(0) + } +}; + +static ctl_table_t top_table[] = { + { + INIT_CTL_NAME(CTL_LNET) + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + INIT_CTL_NAME(0) + } +}; + +void +lnet_proc_init(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = cfs_register_sysctl_table(top_table, 0); +#endif +} + +void +lnet_proc_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif +} + +#else + +void +lnet_proc_init(void) +{ +} + +void +lnet_proc_fini(void) +{ +} + +#endif diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile new file mode 100644 index 000000000000..1e40aeea2962 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o + +lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \ + module.o ping_test.o brw_test.o + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c new file mode 100644 index 000000000000..3bb6fbe23f78 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/brw_test.c @@ -0,0 +1,499 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/brw_test.c + * + * Author: Isaac Huang + */ + +#include "selftest.h" + +static int brw_srv_workitems = SFW_TEST_WI_MAX; +CFS_MODULE_PARM(brw_srv_workitems, "i", int, 0644, "# BRW server workitems"); + +static int brw_inject_errors; +CFS_MODULE_PARM(brw_inject_errors, "i", int, 0644, + "# data errors to inject randomly, zero by default"); + +static void +brw_client_fini (sfw_test_instance_t *tsi) +{ + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT (tsi->tsi_is_client); + + list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) { + bulk = tsu->tsu_private; + if (bulk == NULL) continue; + + srpc_free_bulk(bulk); + tsu->tsu_private = NULL; + } +} + +int +brw_client_init (sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int flags; + int npg; + int len; + int opc; + srpc_bulk_t *bulk; + sfw_test_unit_t *tsu; + + LASSERT(sn != NULL); + LASSERT(tsi->tsi_is_client); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + /* NB: this is not going to work for variable page size, + * but we have to keep it for compatibility */ + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + if (npg > LNET_MAX_IOV || npg <= 0) + return -EINVAL; + + if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) + return -EINVAL; + + if (flags != LST_BRW_CHECK_NONE && + flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) + return -EINVAL; + + list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { + bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid), + npg, len, opc == LST_BRW_READ); + if (bulk == NULL) { + brw_client_fini(tsi); + return -ENOMEM; + } + + tsu->tsu_private = bulk; + } + + return 0; +} + +#define BRW_POISON 0xbeefbeefbeefbeefULL +#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL +#define BRW_MSIZE sizeof(__u64) + +int +brw_inject_one_error (void) +{ + struct timeval tv; + + if (brw_inject_errors <= 0) return 0; + + do_gettimeofday(&tv); + + if ((tv.tv_usec & 1) == 0) return 0; + + return brw_inject_errors--; +} + +void +brw_fill_page (struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + int i; + + LASSERT (addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) return; + + if (magic == BRW_MAGIC) + magic += brw_inject_one_error(); + + if (pattern == LST_BRW_CHECK_SIMPLE) { + memcpy(addr, &magic, BRW_MSIZE); + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + memcpy(addr, &magic, BRW_MSIZE); + return; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) + memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE); + return; + } + + LBUG (); + return; +} + +int +brw_check_page (struct page *pg, int pattern, __u64 magic) +{ + char *addr = page_address(pg); + __u64 data = 0; /* make compiler happy */ + int i; + + LASSERT (addr != NULL); + + if (pattern == LST_BRW_CHECK_NONE) + return 0; + + if (pattern == LST_BRW_CHECK_SIMPLE) { + data = *((__u64 *) addr); + if (data != magic) goto bad_data; + + addr += PAGE_CACHE_SIZE - BRW_MSIZE; + data = *((__u64 *) addr); + if (data != magic) goto bad_data; + + return 0; + } + + if (pattern == LST_BRW_CHECK_FULL) { + for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) { + data = *(((__u64 *) addr) + i); + if (data != magic) goto bad_data; + } + + return 0; + } + + LBUG (); + +bad_data: + CERROR ("Bad data in page %p: "LPX64", "LPX64" expected\n", + pg, data, magic); + return 1; +} + +void +brw_fill_bulk (srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + brw_fill_page(pg, pattern, magic); + } +} + +int +brw_check_bulk (srpc_bulk_t *bk, int pattern, __u64 magic) +{ + int i; + struct page *pg; + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (brw_check_page(pg, pattern, magic) != 0) { + CERROR ("Bulk page %p (%d/%d) is corrupted!\n", + pg, i, bk->bk_niov); + return 1; + } + } + + return 0; +} + +static int +brw_client_prep_rpc (sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpcpp) +{ + srpc_bulk_t *bulk = tsu->tsu_private; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_client_rpc_t *rpc; + srpc_brw_reqst_t *req; + int flags; + int npg; + int len; + int opc; + int rc; + + LASSERT(sn != NULL); + LASSERT(bulk != NULL); + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *breq = &tsi->tsi_u.bulk_v0; + + opc = breq->blk_opc; + flags = breq->blk_flags; + npg = breq->blk_npg; + len = npg * PAGE_CACHE_SIZE; + + } else { + test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1; + + /* I should never get this step if it's unknown feature + * because make_session will reject unknown feature */ + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + opc = breq->blk_opc; + flags = breq->blk_flags; + len = breq->blk_len; + npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); + if (rc != 0) + return rc; + + memcpy(&rpc->crpc_bulk, bulk, offsetof(srpc_bulk_t, bk_iovs[npg])); + if (opc == LST_BRW_WRITE) + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); + else + brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); + + req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + req->brw_flags = flags; + req->brw_rw = opc; + req->brw_len = len; + + *rpcpp = rpc; + return 0; +} + +static void +brw_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + __u64 magic = BRW_MAGIC; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_msg_t *msg = &rpc->crpc_replymsg; + srpc_brw_reply_t *reply = &msg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; + + LASSERT (sn != NULL); + + if (rpc->crpc_status != 0) { + CERROR ("BRW RPC to %s failed with %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_brw_errors); + goto out; + } + + if (msg->msg_magic != SRPC_MSG_MAGIC) { + __swab64s(&magic); + __swab32s(&reply->brw_status); + } + + CDEBUG (reply->brw_status ? D_WARNING : D_NET, + "BRW RPC to %s finished with brw_status: %d\n", + libcfs_id2str(rpc->crpc_dest), reply->brw_status); + + if (reply->brw_status != 0) { + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -(int)reply->brw_status; + goto out; + } + + if (reqst->brw_rw == LST_BRW_WRITE) goto out; + + if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->crpc_dest)); + atomic_inc(&sn->sn_brw_errors); + rpc->crpc_status = -EBADMSG; + } + +out: + return; +} + +void +brw_server_rpc_done (srpc_server_rpc_t *rpc) +{ + srpc_bulk_t *blk = rpc->srpc_bulk; + + if (blk == NULL) return; + + if (rpc->srpc_status != 0) + CERROR ("Bulk transfer %s %s has failed: %d\n", + blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); + else + CDEBUG (D_NET, "Transfered %d pages bulk data %s %s\n", + blk->bk_niov, blk->bk_sink ? "from" : "to", + libcfs_id2str(rpc->srpc_peer)); + + sfw_free_pages(rpc); +} + +int +brw_bulk_ready (srpc_server_rpc_t *rpc, int status) +{ + __u64 magic = BRW_MAGIC; + srpc_brw_reply_t *reply = &rpc->srpc_replymsg.msg_body.brw_reply; + srpc_brw_reqst_t *reqst; + srpc_msg_t *reqstmsg; + + LASSERT (rpc->srpc_bulk != NULL); + LASSERT (rpc->srpc_reqstbuf != NULL); + + reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + reqst = &reqstmsg->msg_body.brw_reqst; + + if (status != 0) { + CERROR ("BRW bulk %s failed for RPC from %s: %d\n", + reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE", + libcfs_id2str(rpc->srpc_peer), status); + return -EIO; + } + + if (reqst->brw_rw == LST_BRW_READ) + return 0; + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) + __swab64s(&magic); + + if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic) != 0) { + CERROR ("Bulk data from %s is corrupted!\n", + libcfs_id2str(rpc->srpc_peer)); + reply->brw_status = EBADMSG; + } + + return 0; +} + +int +brw_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_brw_reply_t *reply = &replymsg->msg_body.brw_reply; + srpc_brw_reqst_t *reqst = &reqstmsg->msg_body.brw_reqst; + int npg; + int rc; + + LASSERT (sv->sv_id == SRPC_SERVICE_BRW); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&reqst->brw_rw); + __swab32s(&reqst->brw_len); + __swab32s(&reqst->brw_flags); + __swab64s(&reqst->brw_rpyid); + __swab64s(&reqst->brw_bulkid); + } + LASSERT (reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); + + reply->brw_status = 0; + rpc->srpc_done = brw_server_rpc_done; + + if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || + (reqst->brw_flags != LST_BRW_CHECK_NONE && + reqst->brw_flags != LST_BRW_CHECK_FULL && + reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { + reply->brw_status = EINVAL; + return 0; + } + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + reply->brw_status = EPROTO; + return 0; + } + + if ((reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + /* compat with old version */ + if ((reqst->brw_len & ~CFS_PAGE_MASK) != 0) { + reply->brw_status = EINVAL; + return 0; + } + npg = reqst->brw_len >> PAGE_CACHE_SHIFT; + + } else { + npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + if (reqst->brw_len == 0 || npg > LNET_MAX_IOV) { + reply->brw_status = EINVAL; + return 0; + } + + rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, + reqst->brw_len, + reqst->brw_rw == LST_BRW_WRITE); + if (rc != 0) + return rc; + + if (reqst->brw_rw == LST_BRW_READ) + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); + else + brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); + + return 0; +} + +sfw_test_client_ops_t brw_test_client; +void brw_init_test_client(void) +{ + brw_test_client.tso_init = brw_client_init; + brw_test_client.tso_fini = brw_client_fini; + brw_test_client.tso_prep_rpc = brw_client_prep_rpc; + brw_test_client.tso_done_rpc = brw_client_done_rpc; +}; + +srpc_service_t brw_test_service; +void brw_init_test_service(void) +{ + + brw_test_service.sv_id = SRPC_SERVICE_BRW; + brw_test_service.sv_name = "brw_test"; + brw_test_service.sv_handler = brw_server_handle; + brw_test_service.sv_bulk_ready = brw_bulk_ready; + brw_test_service.sv_wi_total = brw_srv_workitems; +} diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c new file mode 100644 index 000000000000..bce3d3bde6b2 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/conctl.c @@ -0,0 +1,931 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * IOC handle in kernel + * + * Author: Liang Zhen + */ + +#include +#include +#include +#include "console.h" + +int +lst_session_new_ioctl(lstio_session_new_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_ses_idp == NULL || /* address for output sid */ + args->lstio_ses_key == 0 || /* no key is specified */ + args->lstio_ses_namep == NULL || /* session name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_ses_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_ses_namep, + args->lstio_ses_nmlen)) { + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_ses_nmlen] = 0; + + rc = lstcon_session_new(name, + args->lstio_ses_key, + args->lstio_ses_feats, + args->lstio_ses_force, + args->lstio_ses_timeout, + args->lstio_ses_idp); + + LIBCFS_FREE(name, args->lstio_ses_nmlen + 1); + return rc; +} + +int +lst_session_end_ioctl(lstio_session_end_args_t *args) +{ + if (args->lstio_ses_key != console_session.ses_key) + return -EACCES; + + return lstcon_session_end(); +} + +int +lst_session_info_ioctl(lstio_session_info_args_t *args) +{ + /* no checking of key */ + + if (args->lstio_ses_idp == NULL || /* address for ouput sid */ + args->lstio_ses_keyp == NULL || /* address for ouput key */ + args->lstio_ses_featp == NULL || /* address for ouput features */ + args->lstio_ses_ndinfo == NULL || /* address for output ndinfo */ + args->lstio_ses_namep == NULL || /* address for ouput name */ + args->lstio_ses_nmlen <= 0 || + args->lstio_ses_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_session_info(args->lstio_ses_idp, + args->lstio_ses_keyp, + args->lstio_ses_featp, + args->lstio_ses_ndinfo, + args->lstio_ses_namep, + args->lstio_ses_nmlen); +} + +int +lst_debug_ioctl(lstio_debug_args_t *args) +{ + char *name = NULL; + int client = 1; + int rc; + + if (args->lstio_dbg_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_dbg_resultp == NULL) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL && /* name of batch/group */ + (args->lstio_dbg_nmlen <= 0 || + args->lstio_dbg_nmlen > LST_NAME_SIZE)) + return -EINVAL; + + if (args->lstio_dbg_namep != NULL) { + LIBCFS_ALLOC(name, args->lstio_dbg_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_dbg_namep, + args->lstio_dbg_nmlen)) { + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_dbg_nmlen] = 0; + } + + rc = -EINVAL; + + switch (args->lstio_dbg_type) { + case LST_OPC_SESSION: + rc = lstcon_session_debug(args->lstio_dbg_timeout, + args->lstio_dbg_resultp); + break; + + case LST_OPC_BATCHSRV: + client = 0; + case LST_OPC_BATCHCLI: + if (name == NULL) + goto out; + + rc = lstcon_batch_debug(args->lstio_dbg_timeout, + name, client, args->lstio_dbg_resultp); + break; + + case LST_OPC_GROUP: + if (name == NULL) + goto out; + + rc = lstcon_group_debug(args->lstio_dbg_timeout, + name, args->lstio_dbg_resultp); + break; + + case LST_OPC_NODES: + if (args->lstio_dbg_count <= 0 || + args->lstio_dbg_idsp == NULL) + goto out; + + rc = lstcon_nodes_debug(args->lstio_dbg_timeout, + args->lstio_dbg_count, + args->lstio_dbg_idsp, + args->lstio_dbg_resultp); + break; + + default: + break; + } + +out: + if (name != NULL) + LIBCFS_FREE(name, args->lstio_dbg_nmlen + 1); + + return rc; +} + +int +lst_group_add_ioctl(lstio_group_add_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL|| + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_add(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +int +lst_group_del_ioctl(lstio_group_del_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_del(name); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +int +lst_group_update_ioctl(lstio_group_update_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_resultp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + switch (args->lstio_grp_opc) { + case LST_GROUP_CLEAN: + rc = lstcon_group_clean(name, args->lstio_grp_args); + break; + + case LST_GROUP_REFRESH: + rc = lstcon_group_refresh(name, args->lstio_grp_resultp); + break; + + case LST_GROUP_RMND: + if (args->lstio_grp_count <= 0 || + args->lstio_grp_idsp == NULL) { + rc = -EINVAL; + break; + } + rc = lstcon_nodes_remove(name, args->lstio_grp_count, + args->lstio_grp_idsp, + args->lstio_grp_resultp); + break; + + default: + rc = -EINVAL; + break; + } + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return rc; +} + +int +lst_nodes_add_ioctl(lstio_group_nodes_args_t *args) +{ + unsigned feats; + int rc; + char *name; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idsp == NULL || /* array of ids */ + args->lstio_grp_count <= 0 || + args->lstio_grp_resultp == NULL || + args->lstio_grp_featp == NULL || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_nodes_add(name, args->lstio_grp_count, + args->lstio_grp_idsp, &feats, + args->lstio_grp_resultp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + if (rc == 0 && + copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { + return -EINVAL; + } + + return rc; +} + +int +lst_group_list_ioctl(lstio_group_list_args_t *args) +{ + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_idx < 0 || + args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_group_list(args->lstio_grp_idx, + args->lstio_grp_nmlen, + args->lstio_grp_namep); +} + +int +lst_group_info_ioctl(lstio_group_info_args_t *args) +{ + char *name; + int ndent; + int index; + int rc; + + if (args->lstio_grp_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_grp_namep == NULL || + args->lstio_grp_nmlen <= 0 || + args->lstio_grp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_grp_entp == NULL && /* output: group entry */ + args->lstio_grp_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_grp_dentsp != NULL) { /* have node entry */ + if (args->lstio_grp_idxp == NULL || /* node index */ + args->lstio_grp_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&ndent, args->lstio_grp_ndentp, + sizeof(ndent)) || + copy_from_user(&index, args->lstio_grp_idxp, + sizeof(index))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_grp_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_grp_namep, + args->lstio_grp_nmlen)) { + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_grp_nmlen] = 0; + + rc = lstcon_group_info(name, args->lstio_grp_entp, + &index, &ndent, args->lstio_grp_dentsp); + + LIBCFS_FREE(name, args->lstio_grp_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_grp_dentsp != NULL && + (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return 0; +} + +int +lst_batch_add_ioctl(lstio_batch_add_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_add(name); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +int +lst_batch_run_ioctl(lstio_batch_run_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_run(name, args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +int +lst_batch_stop_ioctl(lstio_batch_stop_args_t *args) +{ + int rc; + char *name; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_stop(name, args->lstio_bat_force, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +int +lst_batch_query_ioctl(lstio_batch_query_args_t *args) +{ + char *name; + int rc; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_resultp == NULL || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_testidx < 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, + args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_test_batch_query(name, + args->lstio_bat_testidx, + args->lstio_bat_client, + args->lstio_bat_timeout, + args->lstio_bat_resultp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + return rc; +} + +int +lst_batch_list_ioctl(lstio_batch_list_args_t *args) +{ + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_idx < 0 || + args->lstio_bat_namep == NULL || + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + return lstcon_batch_list(args->lstio_bat_idx, + args->lstio_bat_nmlen, + args->lstio_bat_namep); +} + +int +lst_batch_info_ioctl(lstio_batch_info_args_t *args) +{ + char *name; + int rc; + int index; + int ndent; + + if (args->lstio_bat_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_bat_namep == NULL || /* batch name */ + args->lstio_bat_nmlen <= 0 || + args->lstio_bat_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_bat_entp == NULL && /* output: batch entry */ + args->lstio_bat_dentsp == NULL) /* output: node entry */ + return -EINVAL; + + if (args->lstio_bat_dentsp != NULL) { /* have node entry */ + if (args->lstio_bat_idxp == NULL || /* node index */ + args->lstio_bat_ndentp == NULL) /* # of node entry */ + return -EINVAL; + + if (copy_from_user(&index, args->lstio_bat_idxp, + sizeof(index)) || + copy_from_user(&ndent, args->lstio_bat_ndentp, + sizeof(ndent))) + return -EFAULT; + + if (ndent <= 0 || index < 0) + return -EINVAL; + } + + LIBCFS_ALLOC(name, args->lstio_bat_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, + args->lstio_bat_namep, args->lstio_bat_nmlen)) { + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + return -EFAULT; + } + + name[args->lstio_bat_nmlen] = 0; + + rc = lstcon_batch_info(name, + args->lstio_bat_entp, args->lstio_bat_server, + args->lstio_bat_testidx, &index, &ndent, + args->lstio_bat_dentsp); + + LIBCFS_FREE(name, args->lstio_bat_nmlen + 1); + + if (rc != 0) + return rc; + + if (args->lstio_bat_dentsp != NULL && + (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || + copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) + rc = -EFAULT; + + return rc; +} + +int +lst_stat_query_ioctl(lstio_stat_args_t *args) +{ + int rc; + char *name; + + /* TODO: not finished */ + if (args->lstio_sta_key != console_session.ses_key) + return -EACCES; + + if (args->lstio_sta_resultp == NULL || + (args->lstio_sta_namep == NULL && + args->lstio_sta_idsp == NULL) || + args->lstio_sta_nmlen <= 0 || + args->lstio_sta_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_sta_idsp != NULL && + args->lstio_sta_count <= 0) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_sta_nmlen + 1); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, args->lstio_sta_namep, + args->lstio_sta_nmlen)) { + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + return -EFAULT; + } + + if (args->lstio_sta_idsp == NULL) { + rc = lstcon_group_stat(name, args->lstio_sta_timeout, + args->lstio_sta_resultp); + } else { + rc = lstcon_nodes_stat(args->lstio_sta_count, + args->lstio_sta_idsp, + args->lstio_sta_timeout, + args->lstio_sta_resultp); + } + + LIBCFS_FREE(name, args->lstio_sta_nmlen + 1); + + return rc; +} + +int lst_test_add_ioctl(lstio_test_args_t *args) +{ + char *name; + char *srcgrp = NULL; + char *dstgrp = NULL; + void *param = NULL; + int ret = 0; + int rc = -ENOMEM; + + if (args->lstio_tes_resultp == NULL || + args->lstio_tes_retp == NULL || + args->lstio_tes_bat_name == NULL || /* no specified batch */ + args->lstio_tes_bat_nmlen <= 0 || + args->lstio_tes_bat_nmlen > LST_NAME_SIZE || + args->lstio_tes_sgrp_name == NULL || /* no source group */ + args->lstio_tes_sgrp_nmlen <= 0 || + args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || + args->lstio_tes_dgrp_name == NULL || /* no target group */ + args->lstio_tes_dgrp_nmlen <= 0 || + args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) + return -EINVAL; + + if (args->lstio_tes_loop == 0 || /* negative is infinite */ + args->lstio_tes_concur <= 0 || + args->lstio_tes_dist <= 0 || + args->lstio_tes_span <= 0) + return -EINVAL; + + /* have parameter, check if parameter length is valid */ + if (args->lstio_tes_param != NULL && + (args->lstio_tes_param_len <= 0 || + args->lstio_tes_param_len > PAGE_CACHE_SIZE - sizeof(lstcon_test_t))) + return -EINVAL; + + LIBCFS_ALLOC(name, args->lstio_tes_bat_nmlen + 1); + if (name == NULL) + return rc; + + LIBCFS_ALLOC(srcgrp, args->lstio_tes_sgrp_nmlen + 1); + if (srcgrp == NULL) + goto out; + + LIBCFS_ALLOC(dstgrp, args->lstio_tes_dgrp_nmlen + 1); + if (dstgrp == NULL) + goto out; + + if (args->lstio_tes_param != NULL) { + LIBCFS_ALLOC(param, args->lstio_tes_param_len); + if (param == NULL) + goto out; + } + + rc = -EFAULT; + if (copy_from_user(name, + args->lstio_tes_bat_name, + args->lstio_tes_bat_nmlen) || + copy_from_user(srcgrp, + args->lstio_tes_sgrp_name, + args->lstio_tes_sgrp_nmlen) || + copy_from_user(dstgrp, + args->lstio_tes_dgrp_name, + args->lstio_tes_dgrp_nmlen) || + copy_from_user(param, args->lstio_tes_param, + args->lstio_tes_param_len)) + goto out; + + rc = lstcon_test_add(name, + args->lstio_tes_type, + args->lstio_tes_loop, + args->lstio_tes_concur, + args->lstio_tes_dist, args->lstio_tes_span, + srcgrp, dstgrp, param, args->lstio_tes_param_len, + &ret, args->lstio_tes_resultp); + + if (ret != 0) + rc = (copy_to_user(args->lstio_tes_retp, &ret, + sizeof(ret))) ? -EFAULT : 0; +out: + if (name != NULL) + LIBCFS_FREE(name, args->lstio_tes_bat_nmlen + 1); + + if (srcgrp != NULL) + LIBCFS_FREE(srcgrp, args->lstio_tes_sgrp_nmlen + 1); + + if (dstgrp != NULL) + LIBCFS_FREE(dstgrp, args->lstio_tes_dgrp_nmlen + 1); + + if (param != NULL) + LIBCFS_FREE(param, args->lstio_tes_param_len); + + return rc; +} + +int +lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + char *buf; + int opc = data->ioc_u32[0]; + int rc; + + if (cmd != IOC_LIBCFS_LNETST) + return -EINVAL; + + if (data->ioc_plen1 > PAGE_CACHE_SIZE) + return -EINVAL; + + LIBCFS_ALLOC(buf, data->ioc_plen1); + if (buf == NULL) + return -ENOMEM; + + /* copy in parameter */ + if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) { + LIBCFS_FREE(buf, data->ioc_plen1); + return -EFAULT; + } + + mutex_lock(&console_session.ses_mutex); + + console_session.ses_laststamp = cfs_time_current_sec(); + + if (console_session.ses_shutdown) { + rc = -ESHUTDOWN; + goto out; + } + + if (console_session.ses_expired) + lstcon_session_end(); + + if (opc != LSTIO_SESSION_NEW && + console_session.ses_state == LST_SESSION_NONE) { + CDEBUG(D_NET, "LST no active session\n"); + rc = -ESRCH; + goto out; + } + + memset(&console_session.ses_trans_stat, 0, sizeof(lstcon_trans_stat_t)); + + switch (opc) { + case LSTIO_SESSION_NEW: + rc = lst_session_new_ioctl((lstio_session_new_args_t *)buf); + break; + case LSTIO_SESSION_END: + rc = lst_session_end_ioctl((lstio_session_end_args_t *)buf); + break; + case LSTIO_SESSION_INFO: + rc = lst_session_info_ioctl((lstio_session_info_args_t *)buf); + break; + case LSTIO_DEBUG: + rc = lst_debug_ioctl((lstio_debug_args_t *)buf); + break; + case LSTIO_GROUP_ADD: + rc = lst_group_add_ioctl((lstio_group_add_args_t *)buf); + break; + case LSTIO_GROUP_DEL: + rc = lst_group_del_ioctl((lstio_group_del_args_t *)buf); + break; + case LSTIO_GROUP_UPDATE: + rc = lst_group_update_ioctl((lstio_group_update_args_t *)buf); + break; + case LSTIO_NODES_ADD: + rc = lst_nodes_add_ioctl((lstio_group_nodes_args_t *)buf); + break; + case LSTIO_GROUP_LIST: + rc = lst_group_list_ioctl((lstio_group_list_args_t *)buf); + break; + case LSTIO_GROUP_INFO: + rc = lst_group_info_ioctl((lstio_group_info_args_t *)buf); + break; + case LSTIO_BATCH_ADD: + rc = lst_batch_add_ioctl((lstio_batch_add_args_t *)buf); + break; + case LSTIO_BATCH_START: + rc = lst_batch_run_ioctl((lstio_batch_run_args_t *)buf); + break; + case LSTIO_BATCH_STOP: + rc = lst_batch_stop_ioctl((lstio_batch_stop_args_t *)buf); + break; + case LSTIO_BATCH_QUERY: + rc = lst_batch_query_ioctl((lstio_batch_query_args_t *)buf); + break; + case LSTIO_BATCH_LIST: + rc = lst_batch_list_ioctl((lstio_batch_list_args_t *)buf); + break; + case LSTIO_BATCH_INFO: + rc = lst_batch_info_ioctl((lstio_batch_info_args_t *)buf); + break; + case LSTIO_TEST_ADD: + rc = lst_test_add_ioctl((lstio_test_args_t *)buf); + break; + case LSTIO_STAT_QUERY: + rc = lst_stat_query_ioctl((lstio_stat_args_t *)buf); + break; + default: + rc = -EINVAL; + } + + if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, + sizeof(lstcon_trans_stat_t))) + rc = -EFAULT; +out: + mutex_unlock(&console_session.ses_mutex); + + LIBCFS_FREE(buf, data->ioc_plen1); + + return rc; +} + +EXPORT_SYMBOL(lstcon_ioctl_entry); diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c new file mode 100644 index 000000000000..446de0e4672f --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/conrpc.c @@ -0,0 +1,1397 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Console framework rpcs + * + * Author: Liang Zhen + */ + + +#include +#include +#include "timer.h" +#include "conrpc.h" +#include "console.h" + +void lstcon_rpc_stat_reply(lstcon_rpc_trans_t *, srpc_msg_t *, + lstcon_node_t *, lstcon_trans_stat_t *); + +static void +lstcon_rpc_done(srpc_client_rpc_t *rpc) +{ + lstcon_rpc_t *crpc = (lstcon_rpc_t *)rpc->crpc_priv; + + LASSERT(crpc != NULL && rpc == crpc->crp_rpc); + LASSERT(crpc->crp_posted && !crpc->crp_finished); + + spin_lock(&rpc->crpc_lock); + + if (crpc->crp_trans == NULL) { + /* Orphan RPC is not in any transaction, + * I'm just a poor body and nobody loves me */ + spin_unlock(&rpc->crpc_lock); + + /* release it */ + lstcon_rpc_put(crpc); + return; + } + + /* not an orphan RPC */ + crpc->crp_finished = 1; + + if (crpc->crp_stamp == 0) { + /* not aborted */ + LASSERT (crpc->crp_status == 0); + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = rpc->crpc_status; + } + + /* wakeup (transaction)thread if I'm the last RPC in the transaction */ + if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) + wake_up(&crpc->crp_trans->tas_waitq); + + spin_unlock(&rpc->crpc_lock); +} + +int +lstcon_rpc_init(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, int embedded, lstcon_rpc_t *crpc) +{ + crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, + feats, bulk_npg, bulk_len, + lstcon_rpc_done, (void *)crpc); + if (crpc->crp_rpc == NULL) + return -ENOMEM; + + crpc->crp_trans = NULL; + crpc->crp_node = nd; + crpc->crp_posted = 0; + crpc->crp_finished = 0; + crpc->crp_unpacked = 0; + crpc->crp_status = 0; + crpc->crp_stamp = 0; + crpc->crp_embedded = embedded; + INIT_LIST_HEAD(&crpc->crp_link); + + atomic_inc(&console_session.ses_rpc_counter); + + return 0; +} + +int +lstcon_rpc_prep(lstcon_node_t *nd, int service, unsigned feats, + int bulk_npg, int bulk_len, lstcon_rpc_t **crpcpp) +{ + lstcon_rpc_t *crpc = NULL; + int rc; + + spin_lock(&console_session.ses_rpc_lock); + + if (!list_empty(&console_session.ses_rpc_freelist)) { + crpc = list_entry(console_session.ses_rpc_freelist.next, + lstcon_rpc_t, crp_link); + list_del_init(&crpc->crp_link); + } + + spin_unlock(&console_session.ses_rpc_lock); + + if (crpc == NULL) { + LIBCFS_ALLOC(crpc, sizeof(*crpc)); + if (crpc == NULL) + return -ENOMEM; + } + + rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); + if (rc == 0) { + *crpcpp = crpc; + return 0; + } + + LIBCFS_FREE(crpc, sizeof(*crpc)); + + return rc; +} + +void +lstcon_rpc_put(lstcon_rpc_t *crpc) +{ + srpc_bulk_t *bulk = &crpc->crp_rpc->crpc_bulk; + int i; + + LASSERT (list_empty(&crpc->crp_link)); + + for (i = 0; i < bulk->bk_niov; i++) { + if (bulk->bk_iovs[i].kiov_page == NULL) + continue; + + __free_page(bulk->bk_iovs[i].kiov_page); + } + + srpc_client_rpc_decref(crpc->crp_rpc); + + if (crpc->crp_embedded) { + /* embedded RPC, don't recycle it */ + memset(crpc, 0, sizeof(*crpc)); + crpc->crp_embedded = 1; + + } else { + spin_lock(&console_session.ses_rpc_lock); + + list_add(&crpc->crp_link, + &console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + } + + /* RPC is not alive now */ + atomic_dec(&console_session.ses_rpc_counter); +} + +void +lstcon_rpc_post(lstcon_rpc_t *crpc) +{ + lstcon_rpc_trans_t *trans = crpc->crp_trans; + + LASSERT (trans != NULL); + + atomic_inc(&trans->tas_remaining); + crpc->crp_posted = 1; + + sfw_post_rpc(crpc->crp_rpc); +} + +static char * +lstcon_rpc_trans_name(int transop) +{ + if (transop == LST_TRANS_SESNEW) + return "SESNEW"; + + if (transop == LST_TRANS_SESEND) + return "SESEND"; + + if (transop == LST_TRANS_SESQRY) + return "SESQRY"; + + if (transop == LST_TRANS_SESPING) + return "SESPING"; + + if (transop == LST_TRANS_TSBCLIADD) + return "TSBCLIADD"; + + if (transop == LST_TRANS_TSBSRVADD) + return "TSBSRVADD"; + + if (transop == LST_TRANS_TSBRUN) + return "TSBRUN"; + + if (transop == LST_TRANS_TSBSTOP) + return "TSBSTOP"; + + if (transop == LST_TRANS_TSBCLIQRY) + return "TSBCLIQRY"; + + if (transop == LST_TRANS_TSBSRVQRY) + return "TSBSRVQRY"; + + if (transop == LST_TRANS_STATQRY) + return "STATQRY"; + + return "Unknown"; +} + +int +lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + + if (translist != NULL) { + list_for_each_entry(trans, translist, tas_link) { + /* Can't enqueue two private transaction on + * the same object */ + if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) + return -EPERM; + } + } + + /* create a trans group */ + LIBCFS_ALLOC(trans, sizeof(*trans)); + if (trans == NULL) + return -ENOMEM; + + trans->tas_opc = transop; + + if (translist == NULL) + INIT_LIST_HEAD(&trans->tas_olink); + else + list_add_tail(&trans->tas_olink, translist); + + list_add_tail(&trans->tas_link, &console_session.ses_trans_list); + + INIT_LIST_HEAD(&trans->tas_rpcs_list); + atomic_set(&trans->tas_remaining, 0); + init_waitqueue_head(&trans->tas_waitq); + + spin_lock(&console_session.ses_rpc_lock); + trans->tas_features = console_session.ses_features; + spin_unlock(&console_session.ses_rpc_lock); + + *transpp = trans; + return 0; +} + +void +lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *crpc) +{ + list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); + crpc->crp_trans = trans; +} + +void +lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_node_t *nd; + + list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + if (!crpc->crp_posted || /* not posted */ + crpc->crp_stamp != 0) { /* rpc done or aborted already */ + if (crpc->crp_stamp == 0) { + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = -EINTR; + } + spin_unlock(&rpc->crpc_lock); + continue; + } + + crpc->crp_stamp = cfs_time_current(); + crpc->crp_status = error; + + spin_unlock(&rpc->crpc_lock); + + sfw_abort_rpc(rpc); + + if (error != ETIMEDOUT) + continue; + + nd = crpc->crp_node; + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + continue; + + nd->nd_stamp = crpc->crp_stamp; + nd->nd_state = LST_NODE_DOWN; + } +} + +static int +lstcon_rpc_trans_check(lstcon_rpc_trans_t *trans) +{ + if (console_session.ses_shutdown && + !list_empty(&trans->tas_olink)) /* Not an end session RPC */ + return 1; + + return (atomic_read(&trans->tas_remaining) == 0) ? 1: 0; +} + +int +lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout) +{ + lstcon_rpc_t *crpc; + int rc; + + if (list_empty(&trans->tas_rpcs_list)) + return 0; + + if (timeout < LST_TRANS_MIN_TIMEOUT) + timeout = LST_TRANS_MIN_TIMEOUT; + + CDEBUG(D_NET, "Transaction %s started\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + /* post all requests */ + list_for_each_entry (crpc, &trans->tas_rpcs_list, crp_link) { + LASSERT (!crpc->crp_posted); + + lstcon_rpc_post(crpc); + } + + mutex_unlock(&console_session.ses_mutex); + + rc = wait_event_interruptible_timeout(trans->tas_waitq, + lstcon_rpc_trans_check(trans), + cfs_time_seconds(timeout)); + rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT); + + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown) + rc = -ESHUTDOWN; + + if (rc != 0 || atomic_read(&trans->tas_remaining) != 0) { + /* treat short timeout as canceled */ + if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) + rc = -EINTR; + + lstcon_rpc_trans_abort(trans, rc); + } + + CDEBUG(D_NET, "Transaction %s stopped: %d\n", + lstcon_rpc_trans_name(trans->tas_opc), rc); + + lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); + + return rc; +} + +int +lstcon_rpc_get_reply(lstcon_rpc_t *crpc, srpc_msg_t **msgpp) +{ + lstcon_node_t *nd = crpc->crp_node; + srpc_client_rpc_t *rpc = crpc->crp_rpc; + srpc_generic_reply_t *rep; + + LASSERT (nd != NULL && rpc != NULL); + LASSERT (crpc->crp_stamp != 0); + + if (crpc->crp_status != 0) { + *msgpp = NULL; + return crpc->crp_status; + } + + *msgpp = &rpc->crpc_replymsg; + if (!crpc->crp_unpacked) { + sfw_unpack_message(*msgpp); + crpc->crp_unpacked = 1; + } + + if (cfs_time_after(nd->nd_stamp, crpc->crp_stamp)) + return 0; + + nd->nd_stamp = crpc->crp_stamp; + rep = &(*msgpp)->msg_body.reply; + + if (rep->sid.ses_nid == LNET_NID_ANY) + nd->nd_state = LST_NODE_UNKNOWN; + else if (lstcon_session_match(rep->sid)) + nd->nd_state = LST_NODE_ACTIVE; + else + nd->nd_state = LST_NODE_BUSY; + + return 0; +} + +void +lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, lstcon_trans_stat_t *stat) +{ + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + int error; + + LASSERT (stat != NULL); + + memset(stat, 0, sizeof(*stat)); + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + lstcon_rpc_stat_total(stat, 1); + + LASSERT (crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &rep); + if (error != 0) { + lstcon_rpc_stat_failure(stat, 1); + if (stat->trs_rpc_errno == 0) + stat->trs_rpc_errno = -error; + + continue; + } + + lstcon_rpc_stat_success(stat, 1); + + lstcon_rpc_stat_reply(trans, rep, crpc->crp_node, stat); + } + + if (trans->tas_opc == LST_TRANS_SESNEW && stat->trs_fwk_errno == 0) { + stat->trs_fwk_errno = + lstcon_session_feats_check(trans->tas_features); + } + + CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, " + "RPC error(%d), Framework error(%d)\n", + lstcon_rpc_trans_name(trans->tas_opc), + lstcon_rpc_stat_success(stat, 0), + lstcon_rpc_stat_failure(stat, 0), + lstcon_rpc_stat_total(stat, 0), + stat->trs_rpc_errno, stat->trs_fwk_errno); + + return; +} + +int +lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, + lstcon_rpc_readent_func_t readent) +{ + struct list_head tmp; + struct list_head *next; + lstcon_rpc_ent_t *ent; + srpc_generic_reply_t *rep; + lstcon_rpc_t *crpc; + srpc_msg_t *msg; + lstcon_node_t *nd; + cfs_duration_t dur; + struct timeval tv; + int error; + + LASSERT (head_up != NULL); + + next = head_up; + + list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { + if (copy_from_user(&tmp, next, + sizeof(struct list_head))) + return -EFAULT; + + if (tmp.next == head_up) + return 0; + + next = tmp.next; + + ent = list_entry(next, lstcon_rpc_ent_t, rpe_link); + + LASSERT (crpc->crp_stamp != 0); + + error = lstcon_rpc_get_reply(crpc, &msg); + + nd = crpc->crp_node; + + dur = (cfs_duration_t)cfs_time_sub(crpc->crp_stamp, + (cfs_time_t)console_session.ses_id.ses_stamp); + cfs_duration_usec(dur, &tv); + + if (copy_to_user(&ent->rpe_peer, + &nd->nd_id, sizeof(lnet_process_id_t)) || + copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || + copy_to_user(&ent->rpe_state, + &nd->nd_state, sizeof(nd->nd_state)) || + copy_to_user(&ent->rpe_rpc_errno, &error, + sizeof(error))) + return -EFAULT; + + if (error != 0) + continue; + + /* RPC is done */ + rep = (srpc_generic_reply_t *)&msg->msg_body.reply; + + if (copy_to_user(&ent->rpe_sid, + &rep->sid, sizeof(lst_sid_t)) || + copy_to_user(&ent->rpe_fwk_errno, + &rep->status, sizeof(rep->status))) + return -EFAULT; + + if (readent == NULL) + continue; + + if ((error = readent(trans->tas_opc, msg, ent)) != 0) + return error; + } + + return 0; +} + +void +lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans) +{ + srpc_client_rpc_t *rpc; + lstcon_rpc_t *crpc; + lstcon_rpc_t *tmp; + int count = 0; + + list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, + crp_link) { + rpc = crpc->crp_rpc; + + spin_lock(&rpc->crpc_lock); + + /* free it if not posted or finished already */ + if (!crpc->crp_posted || crpc->crp_finished) { + spin_unlock(&rpc->crpc_lock); + + list_del_init(&crpc->crp_link); + lstcon_rpc_put(crpc); + + continue; + } + + /* rpcs can be still not callbacked (even LNetMDUnlink is called) + * because huge timeout for inaccessible network, don't make + * user wait for them, just abandon them, they will be recycled + * in callback */ + + LASSERT (crpc->crp_status != 0); + + crpc->crp_node = NULL; + crpc->crp_trans = NULL; + list_del_init(&crpc->crp_link); + count ++; + + spin_unlock(&rpc->crpc_lock); + + atomic_dec(&trans->tas_remaining); + } + + LASSERT (atomic_read(&trans->tas_remaining) == 0); + + list_del(&trans->tas_link); + if (!list_empty(&trans->tas_olink)) + list_del(&trans->tas_olink); + + CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", + lstcon_rpc_trans_name(trans->tas_opc), count); + + LIBCFS_FREE(trans, sizeof(*trans)); + + return; +} + +int +lstcon_sesrpc_prep(lstcon_node_t *nd, int transop, + unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_mksn_reqst_t *msrq; + srpc_rmsn_reqst_t *rsrq; + int rc; + + switch (transop) { + case LST_TRANS_SESNEW: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; + msrq->mksn_sid = console_session.ses_id; + msrq->mksn_force = console_session.ses_force; + strncpy(msrq->mksn_name, console_session.ses_name, + strlen(console_session.ses_name)); + break; + + case LST_TRANS_SESEND: + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, + feats, 0, 0, crpc); + if (rc != 0) + return rc; + + rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; + rsrq->rmsn_sid = console_session.ses_id; + break; + + default: + LBUG(); + } + + return 0; +} + +int +lstcon_dbgrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_debug_reqst_t *drq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + return rc; +} + +int +lstcon_batrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_tsb_hdr_t *tsb, lstcon_rpc_t **crpc) +{ + lstcon_batch_t *batch; + srpc_batch_reqst_t *brq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; + + brq->bar_sid = console_session.ses_id; + brq->bar_bid = tsb->tsb_id; + brq->bar_testidx = tsb->tsb_index; + brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : + (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP: + SRPC_BATCH_OPC_QUERY); + + if (transop != LST_TRANS_TSBRUN && + transop != LST_TRANS_TSBSTOP) + return 0; + + LASSERT (tsb->tsb_index == 0); + + batch = (lstcon_batch_t *)tsb; + brq->bar_arg = batch->bat_arg; + + return 0; +} + +int +lstcon_statrpc_prep(lstcon_node_t *nd, unsigned feats, lstcon_rpc_t **crpc) +{ + srpc_stat_reqst_t *srq; + int rc; + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); + if (rc != 0) + return rc; + + srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; + + srq->str_sid = console_session.ses_id; + srq->str_type = 0; /* XXX remove it */ + + return 0; +} + +lnet_process_id_packed_t * +lstcon_next_id(int idx, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + int i; + + i = idx / SFW_ID_PER_PAGE; + + LASSERT (i < nkiov); + + pid = (lnet_process_id_packed_t *)page_address(kiov[i].kiov_page); + + return &pid[idx % SFW_ID_PER_PAGE]; +} + +int +lstcon_dstnodes_prep(lstcon_group_t *grp, int idx, + int dist, int span, int nkiov, lnet_kiov_t *kiov) +{ + lnet_process_id_packed_t *pid; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int start; + int end; + int i = 0; + + LASSERT (dist >= 1); + LASSERT (span >= 1); + LASSERT (grp->grp_nnode >= 1); + + if (span > grp->grp_nnode) + return -EINVAL; + + start = ((idx / dist) * span) % grp->grp_nnode; + end = ((idx / dist) * span + span - 1) % grp->grp_nnode; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + nd = ndl->ndl_node; + if (i < start) { + i ++; + continue; + } + + if (i > (end >= start ? end: grp->grp_nnode)) + break; + + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + if (start <= end) /* done */ + return 0; + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { + if (i > grp->grp_nnode + end) + break; + + nd = ndl->ndl_node; + pid = lstcon_next_id((i - start), nkiov, kiov); + pid->nid = nd->nd_id.nid; + pid->pid = nd->nd_id.pid; + i++; + } + + return 0; +} + +int +lstcon_pingrpc_prep(lst_test_ping_param_t *param, srpc_test_reqst_t *req) +{ + test_ping_req_t *prq = &req->tsr_u.ping; + + prq->png_size = param->png_size; + prq->png_flags = param->png_flags; + /* TODO dest */ + return 0; +} + +int +lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_t *brq = &req->tsr_u.bulk_v0; + + brq->blk_opc = param->blk_opc; + brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; + brq->blk_flags = param->blk_flags; + + return 0; +} + +int +lstcon_bulkrpc_v1_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req) +{ + test_bulk_req_v1_t *brq = &req->tsr_u.bulk_v1; + + brq->blk_opc = param->blk_opc; + brq->blk_flags = param->blk_flags; + brq->blk_len = param->blk_size; + brq->blk_offset = 0; /* reserved */ + + return 0; +} + +int +lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats, + lstcon_test_t *test, lstcon_rpc_t **crpc) +{ + lstcon_group_t *sgrp = test->tes_src_grp; + lstcon_group_t *dgrp = test->tes_dst_grp; + srpc_test_reqst_t *trq; + srpc_bulk_t *bulk; + int i; + int npg = 0; + int nob = 0; + int rc = 0; + + if (transop == LST_TRANS_TSBCLIADD) { + npg = sfw_id_pages(test->tes_span); + nob = (feats & LST_FEAT_BULK_LEN) == 0 ? + npg * PAGE_CACHE_SIZE : + sizeof(lnet_process_id_packed_t) * test->tes_span; + } + + rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); + if (rc != 0) + return rc; + + trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; + + if (transop == LST_TRANS_TSBSRVADD) { + int ndist = (sgrp->grp_nnode + test->tes_dist - 1) / test->tes_dist; + int nspan = (dgrp->grp_nnode + test->tes_span - 1) / test->tes_span; + int nmax = (ndist + nspan - 1) / nspan; + + trq->tsr_ndest = 0; + trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; + + } else { + bulk = &(*crpc)->crp_rpc->crpc_bulk; + + for (i = 0; i < npg; i++) { + int len; + + LASSERT(nob > 0); + + len = (feats & LST_FEAT_BULK_LEN) == 0 ? + PAGE_CACHE_SIZE : min_t(int, nob, PAGE_CACHE_SIZE); + nob -= len; + + bulk->bk_iovs[i].kiov_offset = 0; + bulk->bk_iovs[i].kiov_len = len; + bulk->bk_iovs[i].kiov_page = + alloc_page(GFP_IOFS); + + if (bulk->bk_iovs[i].kiov_page == NULL) { + lstcon_rpc_put(*crpc); + return -ENOMEM; + } + } + + bulk->bk_sink = 0; + + LASSERT (transop == LST_TRANS_TSBCLIADD); + + rc = lstcon_dstnodes_prep(test->tes_dst_grp, + test->tes_cliidx++, + test->tes_dist, + test->tes_span, + npg, &bulk->bk_iovs[0]); + if (rc != 0) { + lstcon_rpc_put(*crpc); + return rc; + } + + trq->tsr_ndest = test->tes_span; + trq->tsr_loop = test->tes_loop; + } + + trq->tsr_sid = console_session.ses_id; + trq->tsr_bid = test->tes_hdr.tsb_id; + trq->tsr_concur = test->tes_concur; + trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 1 : 0; + trq->tsr_stop_onerr = !!test->tes_stop_onerr; + + switch (test->tes_type) { + case LST_TEST_PING: + trq->tsr_service = SRPC_SERVICE_PING; + rc = lstcon_pingrpc_prep((lst_test_ping_param_t *) + &test->tes_param[0], trq); + break; + + case LST_TEST_BULK: + trq->tsr_service = SRPC_SERVICE_BRW; + if ((feats & LST_FEAT_BULK_LEN) == 0) { + rc = lstcon_bulkrpc_v0_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } else { + rc = lstcon_bulkrpc_v1_prep((lst_test_bulk_param_t *) + &test->tes_param[0], trq); + } + + break; + default: + LBUG(); + break; + } + + return rc; +} + +int +lstcon_sesnew_stat_reply(lstcon_rpc_trans_t *trans, + lstcon_node_t *nd, srpc_msg_t *reply) +{ + srpc_mksn_reply_t *mksn_rep = &reply->msg_body.mksn_reply; + int status = mksn_rep->mksn_status; + + if (status == 0 && + (reply->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + mksn_rep->mksn_status = EPROTO; + status = EPROTO; + } + + if (status == EPROTO) { + CNETERR("session protocol error from %s: %u\n", + libcfs_nid2str(nd->nd_id.nid), + reply->msg_ses_feats); + } + + if (status != 0) + return status; + + if (!trans->tas_feats_updated) { + trans->tas_feats_updated = 1; + trans->tas_features = reply->msg_ses_feats; + } + + if (reply->msg_ses_feats != trans->tas_features) { + CNETERR("Framework features %x from %s is different with " + "features on this transaction: %x\n", + reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), + trans->tas_features); + status = mksn_rep->mksn_status = EPROTO; + } + + if (status == 0) { + /* session timeout on remote node */ + nd->nd_timeout = mksn_rep->mksn_timeout; + } + + return status; +} + +void +lstcon_rpc_stat_reply(lstcon_rpc_trans_t *trans, srpc_msg_t *msg, + lstcon_node_t *nd, lstcon_trans_stat_t *stat) +{ + srpc_rmsn_reply_t *rmsn_rep; + srpc_debug_reply_t *dbg_rep; + srpc_batch_reply_t *bat_rep; + srpc_test_reply_t *test_rep; + srpc_stat_reply_t *stat_rep; + int rc = 0; + + switch (trans->tas_opc) { + case LST_TRANS_SESNEW: + rc = lstcon_sesnew_stat_reply(trans, nd, msg); + if (rc == 0) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + break; + + case LST_TRANS_SESEND: + rmsn_rep = &msg->msg_body.rmsn_reply; + /* ESRCH is not an error for end session */ + if (rmsn_rep->rmsn_status == 0 || + rmsn_rep->rmsn_status == ESRCH) { + lstcon_sesop_stat_success(stat, 1); + return; + } + + lstcon_sesop_stat_failure(stat, 1); + rc = rmsn_rep->rmsn_status; + break; + + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + dbg_rep = &msg->msg_body.dbg_reply; + + if (dbg_rep->dbg_status == ESRCH) { + lstcon_sesqry_stat_unknown(stat, 1); + return; + } + + if (lstcon_session_match(dbg_rep->dbg_sid)) + lstcon_sesqry_stat_active(stat, 1); + else + lstcon_sesqry_stat_busy(stat, 1); + return; + + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + if (bat_rep->bar_status == EPERM && + trans->tas_opc == LST_TRANS_TSBSTOP) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + bat_rep = &msg->msg_body.bat_reply; + + if (bat_rep->bar_active != 0) + lstcon_tsbqry_stat_run(stat, 1); + else + lstcon_tsbqry_stat_idle(stat, 1); + + if (bat_rep->bar_status == 0) + return; + + lstcon_tsbqry_stat_failure(stat, 1); + rc = bat_rep->bar_status; + break; + + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + test_rep = &msg->msg_body.tes_reply; + + if (test_rep->tsr_status == 0) { + lstcon_tsbop_stat_success(stat, 1); + return; + } + + lstcon_tsbop_stat_failure(stat, 1); + rc = test_rep->tsr_status; + break; + + case LST_TRANS_STATQRY: + stat_rep = &msg->msg_body.stat_reply; + + if (stat_rep->str_status == 0) { + lstcon_statqry_stat_success(stat, 1); + return; + } + + lstcon_statqry_stat_failure(stat, 1); + rc = stat_rep->str_status; + break; + + default: + LBUG(); + } + + if (stat->trs_fwk_errno == 0) + stat->trs_fwk_errno = rc; + + return; +} + +int +lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + lstcon_rpc_t *rpc; + unsigned feats; + int rc; + + /* Creating session RPG for list of nodes */ + + rc = lstcon_rpc_trans_prep(translist, transop, &trans); + if (rc != 0) { + CERROR("Can't create transaction %d: %d\n", transop, rc); + return rc; + } + + feats = trans->tas_features; + list_for_each_entry(ndl, ndlist, ndl_link) { + rc = condition == NULL ? 1 : + condition(transop, ndl->ndl_node, arg); + + if (rc == 0) + continue; + + if (rc < 0) { + CDEBUG(D_NET, "Condition error while creating RPC " + " for transaction %d: %d\n", transop, rc); + break; + } + + nd = ndl->ndl_node; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); + break; + case LST_TRANS_SESQRY: + case LST_TRANS_SESPING: + rc = lstcon_dbgrpc_prep(nd, feats, &rpc); + break; + case LST_TRANS_TSBCLIADD: + case LST_TRANS_TSBSRVADD: + rc = lstcon_testrpc_prep(nd, transop, feats, + (lstcon_test_t *)arg, &rpc); + break; + case LST_TRANS_TSBRUN: + case LST_TRANS_TSBSTOP: + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + rc = lstcon_batrpc_prep(nd, transop, feats, + (lstcon_tsb_hdr_t *)arg, &rpc); + break; + case LST_TRANS_STATQRY: + rc = lstcon_statrpc_prep(nd, feats, &rpc); + break; + default: + rc = -EINVAL; + break; + } + + if (rc != 0) { + CERROR("Failed to create RPC for transaction %s: %d\n", + lstcon_rpc_trans_name(transop), rc); + break; + } + + lstcon_rpc_trans_addreq(trans, rpc); + } + + if (rc == 0) { + *transpp = trans; + return 0; + } + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +void +lstcon_rpc_pinger(void *arg) +{ + stt_timer_t *ptimer = (stt_timer_t *)arg; + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + srpc_msg_t *rep; + srpc_debug_reqst_t *drq; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + time_t intv; + int count = 0; + int rc; + + /* RPC pinger is a special case of transaction, + * it's called by timer at 8 seconds interval. + */ + mutex_lock(&console_session.ses_mutex); + + if (console_session.ses_shutdown || console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + if (!console_session.ses_expired && + cfs_time_current_sec() - console_session.ses_laststamp > + (time_t)console_session.ses_timeout) + console_session.ses_expired = 1; + + trans = console_session.ses_ping; + + LASSERT (trans != NULL); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { + nd = ndl->ndl_node; + + if (console_session.ses_expired) { + /* idle console, end session on all nodes */ + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, + trans->tas_features, &crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + continue; + } + + crpc = &nd->nd_ping; + + if (crpc->crp_rpc != NULL) { + LASSERT (crpc->crp_trans == trans); + LASSERT (!list_empty(&crpc->crp_link)); + + spin_lock(&crpc->crp_rpc->crpc_lock); + + LASSERT(crpc->crp_posted); + + if (!crpc->crp_finished) { + /* in flight */ + spin_unlock(&crpc->crp_rpc->crpc_lock); + continue; + } + + spin_unlock(&crpc->crp_rpc->crpc_lock); + + lstcon_rpc_get_reply(crpc, &rep); + + list_del_init(&crpc->crp_link); + + lstcon_rpc_put(crpc); + } + + if (nd->nd_state != LST_NODE_ACTIVE) + continue; + + intv = cfs_duration_sec(cfs_time_sub(cfs_time_current(), + nd->nd_stamp)); + if (intv < (time_t)nd->nd_timeout / 2) + continue; + + rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, + trans->tas_features, 0, 0, 1, crpc); + if (rc != 0) { + CERROR("Out of memory\n"); + break; + } + + drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; + + drq->dbg_sid = console_session.ses_id; + drq->dbg_flags = 0; + + lstcon_rpc_trans_addreq(trans, crpc); + lstcon_rpc_post(crpc); + + count ++; + } + + if (console_session.ses_expired) { + mutex_unlock(&console_session.ses_mutex); + return; + } + + CDEBUG(D_NET, "Ping %d nodes in session\n", count); + + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + stt_add_timer(ptimer); + + mutex_unlock(&console_session.ses_mutex); +} + +int +lstcon_rpc_pinger_start(void) +{ + stt_timer_t *ptimer; + int rc; + + LASSERT (list_empty(&console_session.ses_rpc_freelist)); + LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0); + + rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, + &console_session.ses_ping); + if (rc != 0) { + CERROR("Failed to create console pinger\n"); + return rc; + } + + ptimer = &console_session.ses_ping_timer; + ptimer->stt_expires = (cfs_time_t)(cfs_time_current_sec() + LST_PING_INTERVAL); + + stt_add_timer(ptimer); + + return 0; +} + +void +lstcon_rpc_pinger_stop(void) +{ + LASSERT (console_session.ses_shutdown); + + stt_del_timer(&console_session.ses_ping_timer); + + lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); + lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); + lstcon_rpc_trans_destroy(console_session.ses_ping); + + memset(lstcon_trans_stat(), 0, sizeof(lstcon_trans_stat_t)); + + console_session.ses_ping = NULL; +} + +void +lstcon_rpc_cleanup_wait(void) +{ + lstcon_rpc_trans_t *trans; + lstcon_rpc_t *crpc; + struct list_head *pacer; + struct list_head zlist; + + /* Called with hold of global mutex */ + + LASSERT (console_session.ses_shutdown); + + while (!list_empty(&console_session.ses_trans_list)) { + list_for_each(pacer, &console_session.ses_trans_list) { + trans = list_entry(pacer, lstcon_rpc_trans_t, + tas_link); + + CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", + lstcon_rpc_trans_name(trans->tas_opc)); + + wake_up(&trans->tas_waitq); + } + + mutex_unlock(&console_session.ses_mutex); + + CWARN("Session is shutting down, " + "waiting for termination of transactions\n"); + cfs_pause(cfs_time_seconds(1)); + + mutex_lock(&console_session.ses_mutex); + } + + spin_lock(&console_session.ses_rpc_lock); + + lst_wait_until((atomic_read(&console_session.ses_rpc_counter) == 0), + console_session.ses_rpc_lock, + "Network is not accessable or target is down, " + "waiting for %d console RPCs to being recycled\n", + atomic_read(&console_session.ses_rpc_counter)); + + list_add(&zlist, &console_session.ses_rpc_freelist); + list_del_init(&console_session.ses_rpc_freelist); + + spin_unlock(&console_session.ses_rpc_lock); + + while (!list_empty(&zlist)) { + crpc = list_entry(zlist.next, lstcon_rpc_t, crp_link); + + list_del(&crpc->crp_link); + LIBCFS_FREE(crpc, sizeof(lstcon_rpc_t)); + } +} + +int +lstcon_rpc_module_init(void) +{ + INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); + console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; + console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; + + console_session.ses_ping = NULL; + + spin_lock_init(&console_session.ses_rpc_lock); + atomic_set(&console_session.ses_rpc_counter, 0); + INIT_LIST_HEAD(&console_session.ses_rpc_freelist); + + return 0; +} + +void +lstcon_rpc_module_fini(void) +{ + LASSERT (list_empty(&console_session.ses_rpc_freelist)); + LASSERT (atomic_read(&console_session.ses_rpc_counter) == 0); +} diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h new file mode 100644 index 000000000000..9aba24a2eab9 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/conrpc.h @@ -0,0 +1,146 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * /lnet/selftest/conrpc.h + * + * Console rpc + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONRPC_H__ +#define __LST_CONRPC_H__ + +#include +#include +#include +#include +#include "rpc.h" +#include "selftest.h" + +/* Console rpc and rpc transaction */ +#define LST_TRANS_TIMEOUT 30 +#define LST_TRANS_MIN_TIMEOUT 3 + +#define LST_VALIDATE_TIMEOUT(t) MIN(MAX(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) + +#define LST_PING_INTERVAL 8 + +struct lstcon_rpc_trans; +struct lstcon_tsb_hdr; +struct lstcon_test; +struct lstcon_node; + +typedef struct lstcon_rpc { + struct list_head crp_link; /* chain on rpc transaction */ + srpc_client_rpc_t *crp_rpc; /* client rpc */ + struct lstcon_node *crp_node; /* destination node */ + struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ + + unsigned int crp_posted:1; /* rpc is posted */ + unsigned int crp_finished:1; /* rpc is finished */ + unsigned int crp_unpacked:1; /* reply is unpacked */ + /** RPC is embedded in other structure and can't free it */ + unsigned int crp_embedded:1; + int crp_status; /* console rpc errors */ + cfs_time_t crp_stamp; /* replied time stamp */ +} lstcon_rpc_t; + +typedef struct lstcon_rpc_trans { + struct list_head tas_olink; /* link chain on owner list */ + struct list_head tas_link; /* link chain on global list */ + int tas_opc; /* operation code of transaction */ + /* features mask is uptodate */ + unsigned tas_feats_updated; + /* test features mask */ + unsigned tas_features; + wait_queue_head_t tas_waitq; /* wait queue head */ + atomic_t tas_remaining; /* # of un-scheduled rpcs */ + struct list_head tas_rpcs_list; /* queued requests */ +} lstcon_rpc_trans_t; + +#define LST_TRANS_PRIVATE 0x1000 + +#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) +#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) +#define LST_TRANS_SESQRY 0x03 +#define LST_TRANS_SESPING 0x04 + +#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) +#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) +#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) +#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) +#define LST_TRANS_TSBCLIQRY 0x15 +#define LST_TRANS_TSBSRVQRY 0x16 + +#define LST_TRANS_STATQRY 0x21 + +typedef int (* lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); +typedef int (* lstcon_rpc_readent_func_t)(int, srpc_msg_t *, lstcon_rpc_ent_t *); + +int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_dbgrpc_prep(struct lstcon_node *nd, + unsigned version, lstcon_rpc_t **crpc); +int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_tsb_hdr *tsb, lstcon_rpc_t **crpc); +int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned version, + struct lstcon_test *test, lstcon_rpc_t **crpc); +int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned version, + lstcon_rpc_t **crpc); +void lstcon_rpc_put(lstcon_rpc_t *crpc); +int lstcon_rpc_trans_prep(struct list_head *translist, + int transop, lstcon_rpc_trans_t **transpp); +int lstcon_rpc_trans_ndlist(struct list_head *ndlist, + struct list_head *translist, int transop, + void *arg, lstcon_rpc_cond_func_t condition, + lstcon_rpc_trans_t **transpp); +void lstcon_rpc_trans_stat(lstcon_rpc_trans_t *trans, + lstcon_trans_stat_t *stat); +int lstcon_rpc_trans_interpreter(lstcon_rpc_trans_t *trans, + struct list_head *head_up, + lstcon_rpc_readent_func_t readent); +void lstcon_rpc_trans_abort(lstcon_rpc_trans_t *trans, int error); +void lstcon_rpc_trans_destroy(lstcon_rpc_trans_t *trans); +void lstcon_rpc_trans_addreq(lstcon_rpc_trans_t *trans, lstcon_rpc_t *req); +int lstcon_rpc_trans_postwait(lstcon_rpc_trans_t *trans, int timeout); +int lstcon_rpc_pinger_start(void); +void lstcon_rpc_pinger_stop(void); +void lstcon_rpc_cleanup_wait(void); +int lstcon_rpc_module_init(void); +void lstcon_rpc_module_fini(void); + + +#endif diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c new file mode 100644 index 000000000000..78e8d0467267 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/console.c @@ -0,0 +1,2071 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Infrastructure of LST console + * + * Author: Liang Zhen + */ + + +#include +#include +#include "console.h" +#include "conrpc.h" + +#define LST_NODE_STATE_COUNTER(nd, p) \ +do { \ + if ((nd)->nd_state == LST_NODE_ACTIVE) \ + (p)->nle_nactive ++; \ + else if ((nd)->nd_state == LST_NODE_BUSY) \ + (p)->nle_nbusy ++; \ + else if ((nd)->nd_state == LST_NODE_DOWN) \ + (p)->nle_ndown ++; \ + else \ + (p)->nle_nunknown ++; \ + (p)->nle_nnode ++; \ +} while (0) + +lstcon_session_t console_session; + +void +lstcon_node_get(lstcon_node_t *nd) +{ + LASSERT (nd->nd_ref >= 1); + + nd->nd_ref++; +} + +static int +lstcon_node_find(lnet_process_id_t id, lstcon_node_t **ndpp, int create) +{ + lstcon_ndlink_t *ndl; + unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; + + LASSERT (id.nid != LNET_NID_ANY); + + list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + lstcon_node_get(ndl->ndl_node); + *ndpp = ndl->ndl_node; + return 0; + } + + if (!create) + return -ENOENT; + + LIBCFS_ALLOC(*ndpp, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); + if (*ndpp == NULL) + return -ENOMEM; + + ndl = (lstcon_ndlink_t *)(*ndpp + 1); + + ndl->ndl_node = *ndpp; + + ndl->ndl_node->nd_ref = 1; + ndl->ndl_node->nd_id = id; + ndl->ndl_node->nd_stamp = cfs_time_current(); + ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; + ndl->ndl_node->nd_timeout = 0; + memset(&ndl->ndl_node->nd_ping, 0, sizeof(lstcon_rpc_t)); + + /* queued in global hash & list, no refcount is taken by + * global hash & list, if caller release his refcount, + * node will be released */ + list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); + + return 0; +} + +void +lstcon_node_put(lstcon_node_t *nd) +{ + lstcon_ndlink_t *ndl; + + LASSERT (nd->nd_ref > 0); + + if (--nd->nd_ref > 0) + return; + + ndl = (lstcon_ndlink_t *)(nd + 1); + + LASSERT (!list_empty(&ndl->ndl_link)); + LASSERT (!list_empty(&ndl->ndl_hlink)); + + /* remove from session */ + list_del(&ndl->ndl_link); + list_del(&ndl->ndl_hlink); + + LIBCFS_FREE(nd, sizeof(lstcon_node_t) + sizeof(lstcon_ndlink_t)); +} + +static int +lstcon_ndlink_find(struct list_head *hash, + lnet_process_id_t id, lstcon_ndlink_t **ndlpp, int create) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int rc; + + if (id.nid == LNET_NID_ANY) + return -EINVAL; + + /* search in hash */ + list_for_each_entry(ndl, &hash[idx], ndl_hlink) { + if (ndl->ndl_node->nd_id.nid != id.nid || + ndl->ndl_node->nd_id.pid != id.pid) + continue; + + *ndlpp = ndl; + return 0; + } + + if (create == 0) + return -ENOENT; + + /* find or create in session hash */ + rc = lstcon_node_find(id, &nd, (create == 1) ? 1 : 0); + if (rc != 0) + return rc; + + LIBCFS_ALLOC(ndl, sizeof(lstcon_ndlink_t)); + if (ndl == NULL) { + lstcon_node_put(nd); + return -ENOMEM; + } + + *ndlpp = ndl; + + ndl->ndl_node = nd; + INIT_LIST_HEAD(&ndl->ndl_link); + list_add_tail(&ndl->ndl_hlink, &hash[idx]); + + return 0; +} + +static void +lstcon_ndlink_release(lstcon_ndlink_t *ndl) +{ + LASSERT (list_empty(&ndl->ndl_link)); + LASSERT (!list_empty(&ndl->ndl_hlink)); + + list_del(&ndl->ndl_hlink); /* delete from hash */ + lstcon_node_put(ndl->ndl_node); + + LIBCFS_FREE(ndl, sizeof(*ndl)); +} + +static int +lstcon_group_alloc(char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + int i; + + LIBCFS_ALLOC(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + if (grp == NULL) + return -ENOMEM; + + memset(grp, 0, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); + + grp->grp_ref = 1; + if (name != NULL) + strcpy(grp->grp_name, name); + + INIT_LIST_HEAD(&grp->grp_link); + INIT_LIST_HEAD(&grp->grp_ndl_list); + INIT_LIST_HEAD(&grp->grp_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) + INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); + + *grpp = grp; + + return 0; +} + +static void +lstcon_group_addref(lstcon_group_t *grp) +{ + grp->grp_ref ++; +} + +static void lstcon_group_ndlink_release(lstcon_group_t *, lstcon_ndlink_t *); + +static void +lstcon_group_drain(lstcon_group_t *grp, int keep) +{ + lstcon_ndlink_t *ndl; + lstcon_ndlink_t *tmp; + + list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { + if ((ndl->ndl_node->nd_state & keep) == 0) + lstcon_group_ndlink_release(grp, ndl); + } +} + +static void +lstcon_group_decref(lstcon_group_t *grp) +{ + int i; + + if (--grp->grp_ref > 0) + return; + + if (!list_empty(&grp->grp_link)) + list_del(&grp->grp_link); + + lstcon_group_drain(grp, 0); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT (list_empty(&grp->grp_ndl_hash[i])); + } + + LIBCFS_FREE(grp, offsetof(lstcon_group_t, + grp_ndl_hash[LST_NODE_HASHSIZE])); +} + +static int +lstcon_group_find(char *name, lstcon_group_t **grpp) +{ + lstcon_group_t *grp; + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (strncmp(grp->grp_name, name, LST_NAME_SIZE) != 0) + continue; + + lstcon_group_addref(grp); /* +1 ref for caller */ + *grpp = grp; + return 0; + } + + return -ENOENT; +} + +static void +lstcon_group_put(lstcon_group_t *grp) +{ + lstcon_group_decref(grp); +} + +static int +lstcon_group_ndlink_find(lstcon_group_t *grp, lnet_process_id_t id, + lstcon_ndlink_t **ndlpp, int create) +{ + int rc; + + rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); + if (rc != 0) + return rc; + + if (!list_empty(&(*ndlpp)->ndl_link)) + return 0; + + list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); + grp->grp_nnode ++; + + return 0; +} + +static void +lstcon_group_ndlink_release(lstcon_group_t *grp, lstcon_ndlink_t *ndl) +{ + list_del_init(&ndl->ndl_link); + lstcon_ndlink_release(ndl); + grp->grp_nnode --; +} + +static void +lstcon_group_ndlink_move(lstcon_group_t *old, + lstcon_group_t *new, lstcon_ndlink_t *ndl) +{ + unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % + LST_NODE_HASHSIZE; + + list_del(&ndl->ndl_hlink); + list_del(&ndl->ndl_link); + old->grp_nnode --; + + list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); + list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); + new->grp_nnode ++; + + return; +} + +static void +lstcon_group_move(lstcon_group_t *old, lstcon_group_t *new) +{ + lstcon_ndlink_t *ndl; + + while (!list_empty(&old->grp_ndl_list)) { + ndl = list_entry(old->grp_ndl_list.next, + lstcon_ndlink_t, ndl_link); + lstcon_group_ndlink_move(old, new, ndl); + } +} + +int +lstcon_sesrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_group_t *grp = (lstcon_group_t *)arg; + + switch (transop) { + case LST_TRANS_SESNEW: + if (nd->nd_state == LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_SESEND: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + + if (grp != NULL && nd->nd_ref > 1) + return 0; + break; + + case LST_TRANS_SESQRY: + break; + + default: + LBUG(); + } + + return 1; +} + +int +lstcon_sesrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_debug_reply_t *rep; + + switch (transop) { + case LST_TRANS_SESNEW: + case LST_TRANS_SESEND: + return 0; + + case LST_TRANS_SESQRY: + rep = &msg->msg_body.dbg_reply; + + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->dbg_timeout, sizeof(int)) || + copy_to_user(&ent_up->rpe_payload[0], + &rep->dbg_name, LST_NAME_SIZE)) + return -EFAULT; + + return 0; + + default: + LBUG(); + } + + return 0; +} + +static int +lstcon_group_nodes_add(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* skip if it's in this group already */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); + if (rc == 0) + continue; + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create ndlink, out of memory\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESNEW, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(tmp); + return rc; + } + + /* post all RPCs */ + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + *featp = trans->tas_features; + + /* destroy all RPGs */ + lstcon_rpc_trans_destroy(trans); + + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +static int +lstcon_group_nodes_remove(lstcon_group_t *grp, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int rc; + int i; + + /* End session and remove node from the group */ + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + goto error; + } + + /* move node to tmp group */ + if (lstcon_group_ndlink_find(grp, id, &ndl, 0) == 0) + lstcon_group_ndlink_move(grp, tmp, ndl); + } + + rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, + &tmp->grp_trans_list, LST_TRANS_SESEND, + tmp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + goto error; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* release nodes anyway, because we can't rollback status */ + lstcon_group_put(tmp); + + return rc; +error: + lstcon_group_move(tmp, grp); + lstcon_group_put(tmp); + + return rc; +} + +int +lstcon_group_add(char *name) +{ + lstcon_group_t *grp; + int rc; + + rc = (lstcon_group_find(name, &grp) == 0)? -EEXIST: 0; + if (rc != 0) { + /* find a group with same name */ + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_group_alloc(name, &grp); + if (rc != 0) { + CERROR("Can't allocate descriptor for group %s\n", name); + return -ENOMEM; + } + + list_add_tail(&grp->grp_link, &console_session.ses_grp_list); + + return rc; +} + +int +lstcon_nodes_add(char *name, int count, lnet_process_id_t *ids_up, + unsigned *featp, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + LASSERT (count > 0); + LASSERT (ids_up != NULL); + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by other threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + + return -EBUSY; + } + + rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_del(char *name) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by others threads or test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESEND, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + + lstcon_group_put(grp); + /* -ref for session, it's destroyed, + * status can't be rolled back, destroy group anway */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_clean(char *name, int args) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + args = (LST_NODE_ACTIVE | LST_NODE_BUSY | + LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; + + lstcon_group_drain(grp, args); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return 0; +} + +int +lstcon_nodes_remove(char *name, int count, + lnet_process_id_t *ids_up, struct list_head *result_up) +{ + lstcon_group_t *grp = NULL; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); + + lstcon_group_put(grp); + /* release empty group */ + if (list_empty(&grp->grp_ndl_list)) + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_refresh(char *name, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group: %s\n", name); + return rc; + } + + if (grp->grp_ref > 2) { + /* referred by test */ + CDEBUG(D_NET, "Group %s is busy\n", name); + lstcon_group_put(grp); + return -EBUSY; + } + + /* re-invite all inactive nodes int the group */ + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &grp->grp_trans_list, LST_TRANS_SESNEW, + grp, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + /* local error, return */ + CDEBUG(D_NET, "Can't create transaction: %d\n", rc); + lstcon_group_put(grp); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* -ref for me */ + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_group_list(int index, int len, char *name_up) +{ + lstcon_group_t *grp; + + LASSERT (index >= 0); + LASSERT (name_up != NULL); + + list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { + if (index-- == 0) { + return copy_to_user(name_up, grp->grp_name, len) ? + -EFAULT : 0; + } + } + + return -ENOENT; +} + +static int +lstcon_nodes_getent(struct list_head *head, int *index_p, + int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlink_t *ndl; + lstcon_node_t *nd; + int count = 0; + int index = 0; + + LASSERT (index_p != NULL && count_p != NULL); + LASSERT (dents_up != NULL); + LASSERT (*index_p >= 0); + LASSERT (*count_p > 0); + + list_for_each_entry(ndl, head, ndl_link) { + if (index++ < *index_p) + continue; + + if (count >= *count_p) + break; + + nd = ndl->ndl_node; + if (copy_to_user(&dents_up[count].nde_id, + &nd->nd_id, sizeof(nd->nd_id)) || + copy_to_user(&dents_up[count].nde_state, + &nd->nd_state, sizeof(nd->nd_state))) + return -EFAULT; + + count ++; + } + + if (index <= *index_p) + return -ENOENT; + + *count_p = count; + *index_p = index; + + return 0; +} + +int +lstcon_group_info(char *name, lstcon_ndlist_ent_t *gents_p, + int *index_p, int *count_p, lstcon_node_ent_t *dents_up) +{ + lstcon_ndlist_ent_t *gentp; + lstcon_group_t *grp; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", name); + return rc; + } + + if (dents_up != 0) { + /* verbose query */ + rc = lstcon_nodes_getent(&grp->grp_ndl_list, + index_p, count_p, dents_up); + lstcon_group_put(grp); + + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(gentp, sizeof(lstcon_ndlist_ent_t)); + if (gentp == NULL) { + CERROR("Can't allocate ndlist_ent\n"); + lstcon_group_put(grp); + + return -ENOMEM; + } + + memset(gentp, 0, sizeof(lstcon_ndlist_ent_t)); + + list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); + + rc = copy_to_user(gents_p, gentp, + sizeof(lstcon_ndlist_ent_t)) ? -EFAULT: 0; + + LIBCFS_FREE(gentp, sizeof(lstcon_ndlist_ent_t)); + + lstcon_group_put(grp); + + return 0; +} + +int +lstcon_batch_find(char *name, lstcon_batch_t **batpp) +{ + lstcon_batch_t *bat; + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (strncmp(bat->bat_name, name, LST_NAME_SIZE) == 0) { + *batpp = bat; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_add(char *name) +{ + lstcon_batch_t *bat; + int i; + int rc; + + rc = (lstcon_batch_find(name, &bat) == 0)? -EEXIST: 0; + if (rc != 0) { + CDEBUG(D_NET, "Batch %s already exists\n", name); + return rc; + } + + LIBCFS_ALLOC(bat, sizeof(lstcon_batch_t)); + if (bat == NULL) { + CERROR("Can't allocate descriptor for batch %s\n", name); + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_cli_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + LIBCFS_ALLOC(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + if (bat->bat_srv_hash == NULL) { + CERROR("Can't allocate hash for batch %s\n", name); + LIBCFS_FREE(bat->bat_cli_hash, LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); + + return -ENOMEM; + } + + strcpy(bat->bat_name, name); + bat->bat_hdr.tsb_index = 0; + bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; + + bat->bat_ntest = 0; + bat->bat_state = LST_BATCH_IDLE; + + INIT_LIST_HEAD(&bat->bat_cli_list); + INIT_LIST_HEAD(&bat->bat_srv_list); + INIT_LIST_HEAD(&bat->bat_test_list); + INIT_LIST_HEAD(&bat->bat_trans_list); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + INIT_LIST_HEAD(&bat->bat_cli_hash[i]); + INIT_LIST_HEAD(&bat->bat_srv_hash[i]); + } + + list_add_tail(&bat->bat_link, &console_session.ses_bat_list); + + return rc; +} + +int +lstcon_batch_list(int index, int len, char *name_up) +{ + lstcon_batch_t *bat; + + LASSERT (name_up != NULL); + LASSERT (index >= 0); + + list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { + if (index-- == 0) { + return copy_to_user(name_up,bat->bat_name, len) ? + -EFAULT: 0; + } + } + + return -ENOENT; +} + +int +lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, int server, + int testidx, int *index_p, int *ndent_p, + lstcon_node_ent_t *dents_up) +{ + lstcon_test_batch_ent_t *entp; + struct list_head *clilst; + struct list_head *srvlst; + lstcon_test_t *test = NULL; + lstcon_batch_t *bat; + lstcon_ndlink_t *ndl; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + if (testidx > 0) { + /* query test, test index start from 1 */ + list_for_each_entry(test, &bat->bat_test_list, tes_link) { + if (testidx-- == 1) + break; + } + + if (testidx > 0) { + CDEBUG(D_NET, "Can't find specified test in batch\n"); + return -ENOENT; + } + } + + clilst = (test == NULL) ? &bat->bat_cli_list : + &test->tes_src_grp->grp_ndl_list; + srvlst = (test == NULL) ? &bat->bat_srv_list : + &test->tes_dst_grp->grp_ndl_list; + + if (dents_up != NULL) { + rc = lstcon_nodes_getent((server ? srvlst: clilst), + index_p, ndent_p, dents_up); + return rc; + } + + /* non-verbose query */ + LIBCFS_ALLOC(entp, sizeof(lstcon_test_batch_ent_t)); + if (entp == NULL) + return -ENOMEM; + + memset(entp, 0, sizeof(lstcon_test_batch_ent_t)); + + if (test == NULL) { + entp->u.tbe_batch.bae_ntest = bat->bat_ntest; + entp->u.tbe_batch.bae_state = bat->bat_state; + + } else { + + entp->u.tbe_test.tse_type = test->tes_type; + entp->u.tbe_test.tse_loop = test->tes_loop; + entp->u.tbe_test.tse_concur = test->tes_concur; + } + + list_for_each_entry(ndl, clilst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); + + list_for_each_entry(ndl, srvlst, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); + + rc = copy_to_user(ent_up, entp, + sizeof(lstcon_test_batch_ent_t)) ? -EFAULT : 0; + + LIBCFS_FREE(entp, sizeof(lstcon_test_batch_ent_t)); + + return rc; +} + +int +lstcon_batrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + switch (transop) { + case LST_TRANS_TSBRUN: + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + break; + + case LST_TRANS_TSBSTOP: + if (nd->nd_state != LST_NODE_ACTIVE) + return 0; + break; + + case LST_TRANS_TSBCLIQRY: + case LST_TRANS_TSBSRVQRY: + break; + } + + return 1; +} + +static int +lstcon_batch_op(lstcon_batch_t *bat, int transop, + struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, + &bat->bat_trans_list, transop, + bat, lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_batch_run(char *name, int timeout, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = timeout; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); + + /* mark batch as running if it's started in any node */ + if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0) != 0) + bat->bat_state = LST_BATCH_RUNNING; + + return rc; +} + +int +lstcon_batch_stop(char *name, int force, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + if (lstcon_batch_find(name, &bat) != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return -ENOENT; + } + + bat->bat_arg = force; + + rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); + + /* mark batch as stopped if all RPCs finished */ + if (lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0) == 0) + bat->bat_state = LST_BATCH_IDLE; + + return rc; +} + +static void +lstcon_batch_destroy(lstcon_batch_t *bat) +{ + lstcon_ndlink_t *ndl; + lstcon_test_t *test; + int i; + + list_del(&bat->bat_link); + + while (!list_empty(&bat->bat_test_list)) { + test = list_entry(bat->bat_test_list.next, + lstcon_test_t, tes_link); + LASSERT (list_empty(&test->tes_trans_list)); + + list_del(&test->tes_link); + + lstcon_group_put(test->tes_src_grp); + lstcon_group_put(test->tes_dst_grp); + + LIBCFS_FREE(test, offsetof(lstcon_test_t, + tes_param[test->tes_paramlen])); + } + + LASSERT (list_empty(&bat->bat_trans_list)); + + while (!list_empty(&bat->bat_cli_list)) { + ndl = list_entry(bat->bat_cli_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + while (!list_empty(&bat->bat_srv_list)) { + ndl = list_entry(bat->bat_srv_list.next, + lstcon_ndlink_t, ndl_link); + list_del_init(&ndl->ndl_link); + + lstcon_ndlink_release(ndl); + } + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT (list_empty(&bat->bat_cli_hash[i])); + LASSERT (list_empty(&bat->bat_srv_hash[i])); + } + + LIBCFS_FREE(bat->bat_cli_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat->bat_srv_hash, + sizeof(struct list_head) * LST_NODE_HASHSIZE); + LIBCFS_FREE(bat, sizeof(lstcon_batch_t)); +} + +int +lstcon_testrpc_condition(int transop, lstcon_node_t *nd, void *arg) +{ + lstcon_test_t *test; + lstcon_batch_t *batch; + lstcon_ndlink_t *ndl; + struct list_head *hash; + struct list_head *head; + + test = (lstcon_test_t *)arg; + LASSERT (test != NULL); + + batch = test->tes_batch; + LASSERT (batch != NULL); + + if (test->tes_oneside && + transop == LST_TRANS_TSBSRVADD) + return 0; + + if (nd->nd_state != LST_NODE_ACTIVE) + return -ENETDOWN; + + if (transop == LST_TRANS_TSBCLIADD) { + hash = batch->bat_cli_hash; + head = &batch->bat_cli_list; + + } else { + LASSERT (transop == LST_TRANS_TSBSRVADD); + + hash = batch->bat_srv_hash; + head = &batch->bat_srv_list; + } + + LASSERT (nd->nd_id.nid != LNET_NID_ANY); + + if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1) != 0) + return -ENOMEM; + + if (list_empty(&ndl->ndl_link)) + list_add_tail(&ndl->ndl_link, head); + + return 1; +} + +static int +lstcon_test_nodes_add(lstcon_test_t *test, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + int transop; + int rc; + + LASSERT (test->tes_src_grp != NULL); + LASSERT (test->tes_dst_grp != NULL); + + transop = LST_TRANS_TSBSRVADD; + grp = test->tes_dst_grp; +again: + rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, + &test->tes_trans_list, transop, + test, lstcon_testrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) { + lstcon_rpc_trans_interpreter(trans, result_up, NULL); + + lstcon_rpc_trans_destroy(trans); + /* return if any error */ + CDEBUG(D_NET, "Failed to add test %s, " + "RPC error %d, framework error %d\n", + transop == LST_TRANS_TSBCLIADD ? "client" : "server", + lstcon_trans_stat()->trs_rpc_errno, + lstcon_trans_stat()->trs_fwk_errno); + + return rc; + } + + lstcon_rpc_trans_destroy(trans); + + if (transop == LST_TRANS_TSBCLIADD) + return rc; + + transop = LST_TRANS_TSBCLIADD; + grp = test->tes_src_grp; + test->tes_cliidx = 0; + + /* requests to test clients */ + goto again; +} + +int +lstcon_test_add(char *name, int type, int loop, int concur, + int dist, int span, char *src_name, char * dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up) +{ + lstcon_group_t *src_grp = NULL; + lstcon_group_t *dst_grp = NULL; + lstcon_test_t *test = NULL; + lstcon_batch_t *batch; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch %s\n", name); + return rc; + } + + if (batch->bat_state != LST_BATCH_IDLE) { + CDEBUG(D_NET, "Can't change running batch %s\n", name); + return rc; + } + + rc = lstcon_group_find(src_name, &src_grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", src_name); + goto out; + } + + rc = lstcon_group_find(dst_name, &dst_grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", dst_name); + goto out; + } + + if (dst_grp->grp_userland) + *retp = 1; + + LIBCFS_ALLOC(test, offsetof(lstcon_test_t, tes_param[paramlen])); + if (!test) { + CERROR("Can't allocate test descriptor\n"); + rc = -ENOMEM; + + goto out; + } + + memset(test, 0, offsetof(lstcon_test_t, tes_param[paramlen])); + test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; + test->tes_batch = batch; + test->tes_type = type; + test->tes_oneside = 0; /* TODO */ + test->tes_loop = loop; + test->tes_concur = concur; + test->tes_stop_onerr = 1; /* TODO */ + test->tes_span = span; + test->tes_dist = dist; + test->tes_cliidx = 0; /* just used for creating RPC */ + test->tes_src_grp = src_grp; + test->tes_dst_grp = dst_grp; + INIT_LIST_HEAD(&test->tes_trans_list); + + if (param != NULL) { + test->tes_paramlen = paramlen; + memcpy(&test->tes_param[0], param, paramlen); + } + + rc = lstcon_test_nodes_add(test, result_up); + + if (rc != 0) + goto out; + + if (lstcon_trans_stat()->trs_rpc_errno != 0 || + lstcon_trans_stat()->trs_fwk_errno != 0) + CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, name); + + /* add to test list anyway, so user can check what's going on */ + list_add_tail(&test->tes_link, &batch->bat_test_list); + + batch->bat_ntest ++; + test->tes_hdr.tsb_index = batch->bat_ntest; + + /* hold groups so nobody can change them */ + return rc; +out: + if (test != NULL) + LIBCFS_FREE(test, offsetof(lstcon_test_t, tes_param[paramlen])); + + if (dst_grp != NULL) + lstcon_group_put(dst_grp); + + if (src_grp != NULL) + lstcon_group_put(src_grp); + + return rc; +} + +int +lstcon_test_find(lstcon_batch_t *batch, int idx, lstcon_test_t **testpp) +{ + lstcon_test_t *test; + + list_for_each_entry(test, &batch->bat_test_list, tes_link) { + if (idx == test->tes_hdr.tsb_index) { + *testpp = test; + return 0; + } + } + + return -ENOENT; +} + +int +lstcon_tsbrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + LASSERT (transop == LST_TRANS_TSBCLIQRY || + transop == LST_TRANS_TSBSRVQRY); + + /* positive errno, framework error code */ + if (copy_to_user(&ent_up->rpe_priv[0], + &rep->bar_active, sizeof(rep->bar_active))) + return -EFAULT; + + return 0; +} + +int +lstcon_test_batch_query(char *name, int testidx, int client, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + struct list_head *translist; + struct list_head *ndlist; + lstcon_tsb_hdr_t *hdr; + lstcon_batch_t *batch; + lstcon_test_t *test = NULL; + int transop; + int rc; + + rc = lstcon_batch_find(name, &batch); + if (rc != 0) { + CDEBUG(D_NET, "Can't find batch: %s\n", name); + return rc; + } + + if (testidx == 0) { + translist = &batch->bat_trans_list; + ndlist = &batch->bat_cli_list; + hdr = &batch->bat_hdr; + + } else { + /* query specified test only */ + rc = lstcon_test_find(batch, testidx, &test); + if (rc != 0) { + CDEBUG(D_NET, "Can't find test: %d\n", testidx); + return rc; + } + + translist = &test->tes_trans_list; + ndlist = &test->tes_src_grp->grp_ndl_list; + hdr = &test->tes_hdr; + } + + transop = client ? LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, + lstcon_batrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, timeout); + + if (testidx == 0 && /* query a batch, not a test */ + lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) == 0 && + lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0) == 0) { + /* all RPCs finished, and no active test */ + batch->bat_state = LST_BATCH_IDLE; + } + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_tsbrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_statrpc_readent(int transop, srpc_msg_t *msg, + lstcon_rpc_ent_t *ent_up) +{ + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + sfw_counters_t *sfwk_stat; + srpc_counters_t *srpc_stat; + lnet_counters_t *lnet_stat; + + if (rep->str_status != 0) + return 0; + + sfwk_stat = (sfw_counters_t *)&ent_up->rpe_payload[0]; + srpc_stat = (srpc_counters_t *)((char *)sfwk_stat + sizeof(*sfwk_stat)); + lnet_stat = (lnet_counters_t *)((char *)srpc_stat + sizeof(*srpc_stat)); + + if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || + copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || + copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) + return -EFAULT; + + return 0; +} + +int +lstcon_ndlist_stat(struct list_head *ndlist, + int timeout, struct list_head *result_up) +{ + struct list_head head; + lstcon_rpc_trans_t *trans; + int rc; + + INIT_LIST_HEAD(&head); + + rc = lstcon_rpc_trans_ndlist(ndlist, &head, + LST_TRANS_STATQRY, NULL, NULL, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_statrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_group_stat(char *grp_name, int timeout, struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(grp_name, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Can't find group %s\n", grp_name); + return rc; + } + + rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up) +{ + lstcon_ndlink_t *ndl; + lstcon_group_t *tmp; + lnet_process_id_t id; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &tmp); + if (rc != 0) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + for (i = 0 ; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* add to tmp group */ + rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); + if (rc != 0) { + CDEBUG((rc == -ENOMEM) ? D_ERROR : D_NET, + "Failed to find or create %s: %d\n", + libcfs_id2str(id), rc); + break; + } + } + + if (rc != 0) { + lstcon_group_put(tmp); + return rc; + } + + rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); + + lstcon_group_put(tmp); + + return rc; +} + +int +lstcon_debug_ndlist(struct list_head *ndlist, + struct list_head *translist, + int timeout, struct list_head *result_up) +{ + lstcon_rpc_trans_t *trans; + int rc; + + rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, + NULL, lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); + + rc = lstcon_rpc_trans_interpreter(trans, result_up, + lstcon_sesrpc_readent); + lstcon_rpc_trans_destroy(trans); + + return rc; +} + +int +lstcon_session_debug(int timeout, struct list_head *result_up) +{ + return lstcon_debug_ndlist(&console_session.ses_ndl_list, + NULL, timeout, result_up); +} + +int +lstcon_batch_debug(int timeout, char *name, + int client, struct list_head *result_up) +{ + lstcon_batch_t *bat; + int rc; + + rc = lstcon_batch_find(name, &bat); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list : + &bat->bat_srv_list, + NULL, timeout, result_up); + + return rc; +} + +int +lstcon_group_debug(int timeout, char *name, + struct list_head *result_up) +{ + lstcon_group_t *grp; + int rc; + + rc = lstcon_group_find(name, &grp); + if (rc != 0) + return -ENOENT; + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_nodes_debug(int timeout, + int count, lnet_process_id_t *ids_up, + struct list_head *result_up) +{ + lnet_process_id_t id; + lstcon_ndlink_t *ndl; + lstcon_group_t *grp; + int i; + int rc; + + rc = lstcon_group_alloc(NULL, &grp); + if (rc != 0) { + CDEBUG(D_NET, "Out of memory\n"); + return rc; + } + + for (i = 0; i < count; i++) { + if (copy_from_user(&id, &ids_up[i], sizeof(id))) { + rc = -EFAULT; + break; + } + + /* node is added to tmp group */ + rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); + if (rc != 0) { + CERROR("Can't create node link\n"); + break; + } + } + + if (rc != 0) { + lstcon_group_put(grp); + return rc; + } + + rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, + timeout, result_up); + + lstcon_group_put(grp); + + return rc; +} + +int +lstcon_session_match(lst_sid_t sid) +{ + return (console_session.ses_id.ses_nid == sid.ses_nid && + console_session.ses_id.ses_stamp == sid.ses_stamp) ? 1: 0; +} + +static void +lstcon_new_session_id(lst_sid_t *sid) +{ + lnet_process_id_t id; + + LASSERT (console_session.ses_state == LST_SESSION_NONE); + + LNetGetId(1, &id); + sid->ses_nid = id.nid; + sid->ses_stamp = cfs_time_current(); +} + +extern srpc_service_t lstcon_acceptor_service; + +int +lstcon_session_new(char *name, int key, unsigned feats, + int timeout, int force, lst_sid_t *sid_up) +{ + int rc = 0; + int i; + + if (console_session.ses_state != LST_SESSION_NONE) { + /* session exists */ + if (!force) { + CNETERR("Session %s already exists\n", + console_session.ses_name); + return -EEXIST; + } + + rc = lstcon_session_end(); + + /* lstcon_session_end() only return local error */ + if (rc != 0) + return rc; + } + + if ((feats & ~LST_FEATS_MASK) != 0) { + CNETERR("Unknown session features %x\n", + (feats & ~LST_FEATS_MASK)); + return -EINVAL; + } + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + LASSERT(list_empty(&console_session.ses_ndl_hash[i])); + + lstcon_new_session_id(&console_session.ses_id); + + console_session.ses_key = key; + console_session.ses_state = LST_SESSION_ACTIVE; + console_session.ses_force = !!force; + console_session.ses_features = feats; + console_session.ses_feats_updated = 0; + console_session.ses_timeout = (timeout <= 0) ? + LST_CONSOLE_TIMEOUT : timeout; + strcpy(console_session.ses_name, name); + + rc = lstcon_batch_add(LST_DEFAULT_BATCH); + if (rc != 0) + return rc; + + rc = lstcon_rpc_pinger_start(); + if (rc != 0) { + lstcon_batch_t *bat = NULL; + + lstcon_batch_find(LST_DEFAULT_BATCH, &bat); + lstcon_batch_destroy(bat); + + return rc; + } + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) == 0) + return rc; + + lstcon_session_end(); + + return -EFAULT; +} + +int +lstcon_session_info(lst_sid_t *sid_up, int *key_up, unsigned *featp, + lstcon_ndlist_ent_t *ndinfo_up, char *name_up, int len) +{ + lstcon_ndlist_ent_t *entp; + lstcon_ndlink_t *ndl; + int rc = 0; + + if (console_session.ses_state != LST_SESSION_ACTIVE) + return -ESRCH; + + LIBCFS_ALLOC(entp, sizeof(*entp)); + if (entp == NULL) + return -ENOMEM; + + memset(entp, 0, sizeof(*entp)); + + list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) + LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); + + if (copy_to_user(sid_up, &console_session.ses_id, + sizeof(lst_sid_t)) || + copy_to_user(key_up, &console_session.ses_key, + sizeof(*key_up)) || + copy_to_user(featp, &console_session.ses_features, + sizeof(*featp)) || + copy_to_user(ndinfo_up, entp, sizeof(*entp)) || + copy_to_user(name_up, console_session.ses_name, len)) + rc = -EFAULT; + + LIBCFS_FREE(entp, sizeof(*entp)); + + return rc; +} + +int +lstcon_session_end() +{ + lstcon_rpc_trans_t *trans; + lstcon_group_t *grp; + lstcon_batch_t *bat; + int rc = 0; + + LASSERT (console_session.ses_state == LST_SESSION_ACTIVE); + + rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, + NULL, LST_TRANS_SESEND, NULL, + lstcon_sesrpc_condition, &trans); + if (rc != 0) { + CERROR("Can't create transaction: %d\n", rc); + return rc; + } + + console_session.ses_shutdown = 1; + + lstcon_rpc_pinger_stop(); + + lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); + + lstcon_rpc_trans_destroy(trans); + /* User can do nothing even rpc failed, so go on */ + + /* waiting for orphan rpcs to die */ + lstcon_rpc_cleanup_wait(); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_key = 0; + console_session.ses_force = 0; + console_session.ses_feats_updated = 0; + + /* destroy all batches */ + while (!list_empty(&console_session.ses_bat_list)) { + bat = list_entry(console_session.ses_bat_list.next, + lstcon_batch_t, bat_link); + + lstcon_batch_destroy(bat); + } + + /* destroy all groups */ + while (!list_empty(&console_session.ses_grp_list)) { + grp = list_entry(console_session.ses_grp_list.next, + lstcon_group_t, grp_link); + LASSERT (grp->grp_ref == 1); + + lstcon_group_put(grp); + } + + /* all nodes should be released */ + LASSERT (list_empty(&console_session.ses_ndl_list)); + + console_session.ses_shutdown = 0; + console_session.ses_expired = 0; + + return rc; +} + +int +lstcon_session_feats_check(unsigned feats) +{ + int rc = 0; + + if ((feats & ~LST_FEATS_MASK) != 0) { + CERROR("Can't support these features: %x\n", + (feats & ~LST_FEATS_MASK)); + return -EPROTO; + } + + spin_lock(&console_session.ses_rpc_lock); + + if (!console_session.ses_feats_updated) { + console_session.ses_feats_updated = 1; + console_session.ses_features = feats; + } + + if (console_session.ses_features != feats) + rc = -EPROTO; + + spin_unlock(&console_session.ses_rpc_lock); + + if (rc != 0) { + CERROR("remote features %x do not match with " + "session features %x of console\n", + feats, console_session.ses_features); + } + + return rc; +} + +static int +lstcon_acceptor_handle (srpc_server_rpc_t *rpc) +{ + srpc_msg_t *rep = &rpc->srpc_replymsg; + srpc_msg_t *req = &rpc->srpc_reqstbuf->buf_msg; + srpc_join_reqst_t *jreq = &req->msg_body.join_reqst; + srpc_join_reply_t *jrep = &rep->msg_body.join_reply; + lstcon_group_t *grp = NULL; + lstcon_ndlink_t *ndl; + int rc = 0; + + sfw_unpack_message(req); + + mutex_lock(&console_session.ses_mutex); + + jrep->join_sid = console_session.ses_id; + + if (console_session.ses_id.ses_nid == LNET_NID_ANY) { + jrep->join_status = ESRCH; + goto out; + } + + if (lstcon_session_feats_check(req->msg_ses_feats) != 0) { + jrep->join_status = EPROTO; + goto out; + } + + if (jreq->join_sid.ses_nid != LNET_NID_ANY && + !lstcon_session_match(jreq->join_sid)) { + jrep->join_status = EBUSY; + goto out; + } + + if (lstcon_group_find(jreq->join_group, &grp) != 0) { + rc = lstcon_group_alloc(jreq->join_group, &grp); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + list_add_tail(&grp->grp_link, + &console_session.ses_grp_list); + lstcon_group_addref(grp); + } + + if (grp->grp_ref > 2) { + /* Group in using */ + jrep->join_status = EBUSY; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); + if (rc == 0) { + jrep->join_status = EEXIST; + goto out; + } + + rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); + if (rc != 0) { + CERROR("Out of memory\n"); + goto out; + } + + ndl->ndl_node->nd_state = LST_NODE_ACTIVE; + ndl->ndl_node->nd_timeout = console_session.ses_timeout; + + if (grp->grp_userland == 0) + grp->grp_userland = 1; + + strcpy(jrep->join_session, console_session.ses_name); + jrep->join_timeout = console_session.ses_timeout; + jrep->join_status = 0; + +out: + rep->msg_ses_feats = console_session.ses_features; + if (grp != NULL) + lstcon_group_put(grp); + + mutex_unlock(&console_session.ses_mutex); + + return rc; +} + +srpc_service_t lstcon_acceptor_service; +void lstcon_init_acceptor_service(void) +{ + /* initialize selftest console acceptor service table */ + lstcon_acceptor_service.sv_name = "join session"; + lstcon_acceptor_service.sv_handler = lstcon_acceptor_handle; + lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; + lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; +} + +extern int lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_data *data); + +DECLARE_IOCTL_HANDLER(lstcon_ioctl_handler, lstcon_ioctl_entry); + +/* initialize console */ +int +lstcon_console_init(void) +{ + int i; + int rc; + + memset(&console_session, 0, sizeof(lstcon_session_t)); + + console_session.ses_id = LST_INVALID_SID; + console_session.ses_state = LST_SESSION_NONE; + console_session.ses_timeout = 0; + console_session.ses_force = 0; + console_session.ses_expired = 0; + console_session.ses_feats_updated = 0; + console_session.ses_features = LST_FEATS_MASK; + console_session.ses_laststamp = cfs_time_current_sec(); + + mutex_init(&console_session.ses_mutex); + + INIT_LIST_HEAD(&console_session.ses_ndl_list); + INIT_LIST_HEAD(&console_session.ses_grp_list); + INIT_LIST_HEAD(&console_session.ses_bat_list); + INIT_LIST_HEAD(&console_session.ses_trans_list); + + LIBCFS_ALLOC(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + if (console_session.ses_ndl_hash == NULL) + return -ENOMEM; + + for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) + INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); + + + /* initialize acceptor service table */ + lstcon_init_acceptor_service(); + + rc = srpc_add_service(&lstcon_acceptor_service); + LASSERT (rc != -EBUSY); + if (rc != 0) { + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + return rc; + } + + rc = srpc_service_add_buffers(&lstcon_acceptor_service, + lstcon_acceptor_service.sv_wi_total); + if (rc != 0) { + rc = -ENOMEM; + goto out; + } + + rc = libcfs_register_ioctl(&lstcon_ioctl_handler); + + if (rc == 0) { + lstcon_rpc_module_init(); + return 0; + } + +out: + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return rc; +} + +int +lstcon_console_fini(void) +{ + int i; + + libcfs_deregister_ioctl(&lstcon_ioctl_handler); + + mutex_lock(&console_session.ses_mutex); + + srpc_shutdown_service(&lstcon_acceptor_service); + srpc_remove_service(&lstcon_acceptor_service); + + if (console_session.ses_state != LST_SESSION_NONE) + lstcon_session_end(); + + lstcon_rpc_module_fini(); + + mutex_unlock(&console_session.ses_mutex); + + LASSERT (list_empty(&console_session.ses_ndl_list)); + LASSERT (list_empty(&console_session.ses_grp_list)); + LASSERT (list_empty(&console_session.ses_bat_list)); + LASSERT (list_empty(&console_session.ses_trans_list)); + + for (i = 0; i < LST_NODE_HASHSIZE; i++) { + LASSERT (list_empty(&console_session.ses_ndl_hash[i])); + } + + LIBCFS_FREE(console_session.ses_ndl_hash, + sizeof(struct list_head) * LST_GLOBAL_HASHSIZE); + + srpc_wait_service_shutdown(&lstcon_acceptor_service); + + return 0; +} diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h new file mode 100644 index 000000000000..e61b26687dbb --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/console.h @@ -0,0 +1,232 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/console.h + * + * kernel structure for LST console + * + * Author: Liang Zhen + */ + +#ifndef __LST_CONSOLE_H__ +#define __LST_CONSOLE_H__ + + +#include +#include +#include +#include +#include "selftest.h" +#include "conrpc.h" + +typedef struct lstcon_node { + lnet_process_id_t nd_id; /* id of the node */ + int nd_ref; /* reference count */ + int nd_state; /* state of the node */ + int nd_timeout; /* session timeout */ + cfs_time_t nd_stamp; /* timestamp of last replied RPC */ + struct lstcon_rpc nd_ping; /* ping rpc */ +} lstcon_node_t; /*** node descriptor */ + +typedef struct { + struct list_head ndl_link; /* chain on list */ + struct list_head ndl_hlink; /* chain on hash */ + lstcon_node_t *ndl_node; /* pointer to node */ +} lstcon_ndlink_t; /*** node link descriptor */ + +typedef struct { + struct list_head grp_link; /* chain on global group list */ + int grp_ref; /* reference count */ + int grp_userland; /* has userland nodes */ + int grp_nnode; /* # of nodes */ + char grp_name[LST_NAME_SIZE]; /* group name */ + + struct list_head grp_trans_list; /* transaction list */ + struct list_head grp_ndl_list; /* nodes list */ + struct list_head grp_ndl_hash[0];/* hash table for nodes */ +} lstcon_group_t; /*** (alias of nodes) group descriptor */ + +#define LST_BATCH_IDLE 0xB0 /* idle batch */ +#define LST_BATCH_RUNNING 0xB1 /* running batch */ + +typedef struct lstcon_tsb_hdr { + lst_bid_t tsb_id; /* batch ID */ + int tsb_index; /* test index */ +} lstcon_tsb_hdr_t; + +typedef struct { + lstcon_tsb_hdr_t bat_hdr; /* test_batch header */ + struct list_head bat_link; /* chain on session's batches list */ + int bat_ntest; /* # of test */ + int bat_state; /* state of the batch */ + int bat_arg; /* parameter for run|stop, timeout for run, force for stop */ + char bat_name[LST_NAME_SIZE]; /* name of batch */ + + struct list_head bat_test_list; /* list head of tests (lstcon_test_t) */ + struct list_head bat_trans_list; /* list head of transaction */ + struct list_head bat_cli_list; /* list head of client nodes (lstcon_node_t) */ + struct list_head *bat_cli_hash; /* hash table of client nodes */ + struct list_head bat_srv_list; /* list head of server nodes */ + struct list_head *bat_srv_hash; /* hash table of server nodes */ +} lstcon_batch_t; /*** (tests ) batch descritptor */ + +typedef struct lstcon_test { + lstcon_tsb_hdr_t tes_hdr; /* test batch header */ + struct list_head tes_link; /* chain on batch's tests list */ + lstcon_batch_t *tes_batch; /* pointer to batch */ + + int tes_type; /* type of the test, i.e: bulk, ping */ + int tes_stop_onerr; /* stop on error */ + int tes_oneside; /* one-sided test */ + int tes_concur; /* concurrency */ + int tes_loop; /* loop count */ + int tes_dist; /* nodes distribution of target group */ + int tes_span; /* nodes span of target group */ + int tes_cliidx; /* client index, used for RPC creating */ + + struct list_head tes_trans_list; /* transaction list */ + lstcon_group_t *tes_src_grp; /* group run the test */ + lstcon_group_t *tes_dst_grp; /* target group */ + + int tes_paramlen; /* test parameter length */ + char tes_param[0]; /* test parameter */ +} lstcon_test_t; /*** a single test descriptor */ + +#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ +#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ + +#define LST_SESSION_NONE 0x0 /* no session */ +#define LST_SESSION_ACTIVE 0x1 /* working session */ + +#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ + +typedef struct { + struct mutex ses_mutex; /* only 1 thread in session */ + lst_sid_t ses_id; /* global session id */ + int ses_key; /* local session key */ + int ses_state; /* state of session */ + int ses_timeout; /* timeout in seconds */ + time_t ses_laststamp; /* last operation stamp (seconds) */ + /** tests features of the session */ + unsigned ses_features; + /** features are synced with remote test nodes */ + unsigned ses_feats_updated:1; + /** force creating */ + unsigned ses_force:1; + /** session is shutting down */ + unsigned ses_shutdown:1; + /** console is timedout */ + unsigned ses_expired:1; + __u64 ses_id_cookie; /* batch id cookie */ + char ses_name[LST_NAME_SIZE]; /* session name */ + lstcon_rpc_trans_t *ses_ping; /* session pinger */ + stt_timer_t ses_ping_timer; /* timer for pinger */ + lstcon_trans_stat_t ses_trans_stat; /* transaction stats */ + + struct list_head ses_trans_list; /* global list of transaction */ + struct list_head ses_grp_list; /* global list of groups */ + struct list_head ses_bat_list; /* global list of batches */ + struct list_head ses_ndl_list; /* global list of nodes */ + struct list_head *ses_ndl_hash; /* hash table of nodes */ + + spinlock_t ses_rpc_lock; /* serialize */ + atomic_t ses_rpc_counter;/* # of initialized RPCs */ + struct list_head ses_rpc_freelist; /* idle console rpc */ +} lstcon_session_t; /*** session descriptor */ + +extern lstcon_session_t console_session; + +static inline lstcon_trans_stat_t * +lstcon_trans_stat(void) +{ + return &console_session.ses_trans_stat; +} + +static inline struct list_head * +lstcon_id2hash (lnet_process_id_t id, struct list_head *hash) +{ + unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; + + return &hash[idx]; +} + +extern int lstcon_session_match(lst_sid_t sid); +extern int lstcon_session_new(char *name, int key, unsigned version, + int timeout, int flags, lst_sid_t *sid_up); +extern int lstcon_session_info(lst_sid_t *sid_up, int *key, unsigned *verp, + lstcon_ndlist_ent_t *entp, char *name_up, int len); +extern int lstcon_session_end(void); +extern int lstcon_session_debug(int timeout, struct list_head *result_up); +extern int lstcon_session_feats_check(unsigned feats); +extern int lstcon_batch_debug(int timeout, char *name, + int client, struct list_head *result_up); +extern int lstcon_group_debug(int timeout, char *name, + struct list_head *result_up); +extern int lstcon_nodes_debug(int timeout, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_add(char *name); +extern int lstcon_group_del(char *name); +extern int lstcon_group_clean(char *name, int args); +extern int lstcon_group_refresh(char *name, struct list_head *result_up); +extern int lstcon_nodes_add(char *name, int nnd, lnet_process_id_t *nds_up, + unsigned *featp, struct list_head *result_up); +extern int lstcon_nodes_remove(char *name, int nnd, lnet_process_id_t *nds_up, + struct list_head *result_up); +extern int lstcon_group_info(char *name, lstcon_ndlist_ent_t *gent_up, + int *index_p, int *ndent_p, lstcon_node_ent_t *ndents_up); +extern int lstcon_group_list(int idx, int len, char *name_up); +extern int lstcon_batch_add(char *name); +extern int lstcon_batch_run(char *name, int timeout, + struct list_head *result_up); +extern int lstcon_batch_stop(char *name, int force, + struct list_head *result_up); +extern int lstcon_test_batch_query(char *name, int testidx, + int client, int timeout, + struct list_head *result_up); +extern int lstcon_batch_del(char *name); +extern int lstcon_batch_list(int idx, int namelen, char *name_up); +extern int lstcon_batch_info(char *name, lstcon_test_batch_ent_t *ent_up, + int server, int testidx, int *index_p, + int *ndent_p, lstcon_node_ent_t *dents_up); +extern int lstcon_group_stat(char *grp_name, int timeout, + struct list_head *result_up); +extern int lstcon_nodes_stat(int count, lnet_process_id_t *ids_up, + int timeout, struct list_head *result_up); +extern int lstcon_test_add(char *name, int type, int loop, int concur, + int dist, int span, char *src_name, char * dst_name, + void *param, int paramlen, int *retp, + struct list_head *result_up); + +#endif diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c new file mode 100644 index 000000000000..483c78564dae --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/framework.c @@ -0,0 +1,1814 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/framework.c + * + * Author: Isaac Huang + * Author: Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +lst_sid_t LST_INVALID_SID = {LNET_NID_ANY, -1}; + +static int session_timeout = 100; +CFS_MODULE_PARM(session_timeout, "i", int, 0444, + "test session timeout in seconds (100 by default, 0 == never)"); + +static int rpc_timeout = 64; +CFS_MODULE_PARM(rpc_timeout, "i", int, 0644, + "rpc timeout in seconds (64 by default, 0 == never)"); + +#define sfw_unpack_id(id) \ +do { \ + __swab64s(&(id).nid); \ + __swab32s(&(id).pid); \ +} while (0) + +#define sfw_unpack_sid(sid) \ +do { \ + __swab64s(&(sid).ses_nid); \ + __swab64s(&(sid).ses_stamp); \ +} while (0) + +#define sfw_unpack_fw_counters(fc) \ +do { \ + __swab32s(&(fc).running_ms); \ + __swab32s(&(fc).active_batches); \ + __swab32s(&(fc).zombie_sessions); \ + __swab32s(&(fc).brw_errors); \ + __swab32s(&(fc).ping_errors); \ +} while (0) + +#define sfw_unpack_rpc_counters(rc) \ +do { \ + __swab32s(&(rc).errors); \ + __swab32s(&(rc).rpcs_sent); \ + __swab32s(&(rc).rpcs_rcvd); \ + __swab32s(&(rc).rpcs_dropped); \ + __swab32s(&(rc).rpcs_expired); \ + __swab64s(&(rc).bulk_get); \ + __swab64s(&(rc).bulk_put); \ +} while (0) + +#define sfw_unpack_lnet_counters(lc) \ +do { \ + __swab32s(&(lc).errors); \ + __swab32s(&(lc).msgs_max); \ + __swab32s(&(lc).msgs_alloc); \ + __swab32s(&(lc).send_count); \ + __swab32s(&(lc).recv_count); \ + __swab32s(&(lc).drop_count); \ + __swab32s(&(lc).route_count); \ + __swab64s(&(lc).send_length); \ + __swab64s(&(lc).recv_length); \ + __swab64s(&(lc).drop_length); \ + __swab64s(&(lc).route_length); \ +} while (0) + +#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive) != 0) +#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive) != 0) + +struct smoketest_framework { + struct list_head fw_zombie_rpcs; /* RPCs to be recycled */ + struct list_head fw_zombie_sessions; /* stopping sessions */ + struct list_head fw_tests; /* registered test cases */ + atomic_t fw_nzombies; /* # zombie sessions */ + spinlock_t fw_lock; /* serialise */ + sfw_session_t *fw_session; /* _the_ session */ + int fw_shuttingdown; /* shutdown in progress */ + srpc_server_rpc_t *fw_active_srpc; /* running RPC */ +} sfw_data; + +/* forward ref's */ +int sfw_stop_batch (sfw_batch_t *tsb, int force); +void sfw_destroy_session (sfw_session_t *sn); + +static inline sfw_test_case_t * +sfw_find_test_case(int id) +{ + sfw_test_case_t *tsc; + + LASSERT (id <= SRPC_SERVICE_MAX_ID); + LASSERT (id > SRPC_FRAMEWORK_SERVICE_MAX_ID); + + list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) { + if (tsc->tsc_srv_service->sv_id == id) + return tsc; + } + + return NULL; +} + +static int +sfw_register_test (srpc_service_t *service, sfw_test_client_ops_t *cliops) +{ + sfw_test_case_t *tsc; + + if (sfw_find_test_case(service->sv_id) != NULL) { + CERROR ("Failed to register test %s (%d)\n", + service->sv_name, service->sv_id); + return -EEXIST; + } + + LIBCFS_ALLOC(tsc, sizeof(sfw_test_case_t)); + if (tsc == NULL) + return -ENOMEM; + + memset(tsc, 0, sizeof(sfw_test_case_t)); + tsc->tsc_cli_ops = cliops; + tsc->tsc_srv_service = service; + + list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); + return 0; +} + +void +sfw_add_session_timer (void) +{ + sfw_session_t *sn = sfw_data.fw_session; + stt_timer_t *timer = &sn->sn_timer; + + LASSERT (!sfw_data.fw_shuttingdown); + + if (sn == NULL || sn->sn_timeout == 0) + return; + + LASSERT (!sn->sn_timer_active); + + sn->sn_timer_active = 1; + timer->stt_expires = cfs_time_add(sn->sn_timeout, + cfs_time_current_sec()); + stt_add_timer(timer); + return; +} + +int +sfw_del_session_timer (void) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL || !sn->sn_timer_active) + return 0; + + LASSERT (sn->sn_timeout != 0); + + if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ + sn->sn_timer_active = 0; + return 0; + } + + return EBUSY; /* racing with sfw_session_expired() */ +} + +/* called with sfw_data.fw_lock held */ +static void +sfw_deactivate_session (void) +{ + sfw_session_t *sn = sfw_data.fw_session; + int nactive = 0; + sfw_batch_t *tsb; + sfw_test_case_t *tsc; + + if (sn == NULL) return; + + LASSERT (!sn->sn_timer_active); + + sfw_data.fw_session = NULL; + atomic_inc(&sfw_data.fw_nzombies); + list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); + + spin_unlock(&sfw_data.fw_lock); + + list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { + srpc_abort_service(tsc->tsc_srv_service); + } + + spin_lock(&sfw_data.fw_lock); + + list_for_each_entry (tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + nactive++; + sfw_stop_batch(tsb, 1); + } + } + + if (nactive != 0) + return; /* wait for active batches to stop */ + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + + spin_lock(&sfw_data.fw_lock); +} + + +void +sfw_session_expired (void *data) +{ + sfw_session_t *sn = data; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (sn->sn_timer_active); + LASSERT (sn == sfw_data.fw_session); + + CWARN ("Session expired! sid: %s-"LPU64", name: %s\n", + libcfs_nid2str(sn->sn_id.ses_nid), + sn->sn_id.ses_stamp, &sn->sn_name[0]); + + sn->sn_timer_active = 0; + sfw_deactivate_session(); + + spin_unlock(&sfw_data.fw_lock); +} + +static inline void +sfw_init_session(sfw_session_t *sn, lst_sid_t sid, + unsigned features, const char *name) +{ + stt_timer_t *timer = &sn->sn_timer; + + memset(sn, 0, sizeof(sfw_session_t)); + INIT_LIST_HEAD(&sn->sn_list); + INIT_LIST_HEAD(&sn->sn_batches); + atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ + atomic_set(&sn->sn_brw_errors, 0); + atomic_set(&sn->sn_ping_errors, 0); + strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); + + sn->sn_timer_active = 0; + sn->sn_id = sid; + sn->sn_features = features; + sn->sn_timeout = session_timeout; + sn->sn_started = cfs_time_current(); + + timer->stt_data = sn; + timer->stt_func = sfw_session_expired; + INIT_LIST_HEAD(&timer->stt_list); +} + +/* completion handler for incoming framework RPCs */ +void +sfw_server_rpc_done(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int status = rpc->srpc_status; + + CDEBUG (D_NET, + "Incoming framework RPC done: " + "service %s, peer %s, status %s:%d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + status); + + if (rpc->srpc_bulk != NULL) + sfw_free_pages(rpc); + return; +} + +void +sfw_client_rpc_fini (srpc_client_rpc_t *rpc) +{ + LASSERT (rpc->crpc_bulk.bk_niov == 0); + LASSERT (list_empty(&rpc->crpc_list)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + CDEBUG (D_NET, + "Outgoing framework RPC done: " + "service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), + rpc->crpc_aborted, rpc->crpc_status); + + spin_lock(&sfw_data.fw_lock); + + /* my callers must finish all RPCs before shutting me down */ + LASSERT(!sfw_data.fw_shuttingdown); + list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); + + spin_unlock(&sfw_data.fw_lock); +} + +sfw_batch_t * +sfw_find_batch (lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT (sn != NULL); + + list_for_each_entry (bat, &sn->sn_batches, bat_list) { + if (bat->bat_id.bat_id == bid.bat_id) + return bat; + } + + return NULL; +} + +sfw_batch_t * +sfw_bid2batch (lst_bid_t bid) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_batch_t *bat; + + LASSERT (sn != NULL); + + bat = sfw_find_batch(bid); + if (bat != NULL) + return bat; + + LIBCFS_ALLOC(bat, sizeof(sfw_batch_t)); + if (bat == NULL) + return NULL; + + bat->bat_error = 0; + bat->bat_session = sn; + bat->bat_id = bid; + atomic_set(&bat->bat_nactive, 0); + INIT_LIST_HEAD(&bat->bat_tests); + + list_add_tail(&bat->bat_list, &sn->sn_batches); + return bat; +} + +int +sfw_get_stats (srpc_stat_reqst_t *request, srpc_stat_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + sfw_counters_t *cnt = &reply->str_fw; + sfw_batch_t *bat; + struct timeval tv; + + reply->str_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->str_sid.ses_nid == LNET_NID_ANY) { + reply->str_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->str_sid, sn->sn_id)) { + reply->str_status = ESRCH; + return 0; + } + + lnet_counters_get(&reply->str_lnet); + srpc_get_counters(&reply->str_rpc); + + /* send over the msecs since the session was started + - with 32 bits to send, this is ~49 days */ + cfs_duration_usec(cfs_time_sub(cfs_time_current(), + sn->sn_started), &tv); + + cnt->running_ms = (__u32)(tv.tv_sec * 1000 + tv.tv_usec / 1000); + cnt->brw_errors = atomic_read(&sn->sn_brw_errors); + cnt->ping_errors = atomic_read(&sn->sn_ping_errors); + cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); + + cnt->active_batches = 0; + list_for_each_entry (bat, &sn->sn_batches, bat_list) { + if (atomic_read(&bat->bat_nactive) > 0) + cnt->active_batches++; + } + + reply->str_status = 0; + return 0; +} + +int +sfw_make_session(srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_msg_t *msg = container_of(request, srpc_msg_t, + msg_body.mksn_reqst); + int cplen = 0; + + if (request->mksn_sid.ses_nid == LNET_NID_ANY) { + reply->mksn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + reply->mksn_status = EINVAL; + return 0; + } + + if (sn != NULL) { + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + + if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { + atomic_inc(&sn->sn_refcount); + return 0; + } + + if (!request->mksn_force) { + reply->mksn_status = EBUSY; + cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], + sizeof(reply->mksn_name)); + if (cplen >= sizeof(reply->mksn_name)) + return -E2BIG; + return 0; + } + } + + /* reject the request if it requires unknown features + * NB: old version will always accept all features because it's not + * aware of srpc_msg_t::msg_ses_feats, it's a defect but it's also + * harmless because it will return zero feature to console, and it's + * console's responsibility to make sure all nodes in a session have + * same feature mask. */ + if ((msg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + reply->mksn_status = EPROTO; + return 0; + } + + /* brand new or create by force */ + LIBCFS_ALLOC(sn, sizeof(sfw_session_t)); + if (sn == NULL) { + CERROR ("Dropping RPC (mksn) under memory pressure.\n"); + return -ENOMEM; + } + + sfw_init_session(sn, request->mksn_sid, + msg->msg_ses_feats, &request->mksn_name[0]); + + spin_lock(&sfw_data.fw_lock); + + sfw_deactivate_session(); + LASSERT(sfw_data.fw_session == NULL); + sfw_data.fw_session = sn; + + spin_unlock(&sfw_data.fw_lock); + + reply->mksn_status = 0; + reply->mksn_sid = sn->sn_id; + reply->mksn_timeout = sn->sn_timeout; + return 0; +} + +int +sfw_remove_session (srpc_rmsn_reqst_t *request, srpc_rmsn_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + reply->rmsn_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { + reply->rmsn_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { + reply->rmsn_status = (sn == NULL) ? ESRCH : EBUSY; + return 0; + } + + if (!atomic_dec_and_test(&sn->sn_refcount)) { + reply->rmsn_status = 0; + return 0; + } + + spin_lock(&sfw_data.fw_lock); + sfw_deactivate_session(); + spin_unlock(&sfw_data.fw_lock); + + reply->rmsn_status = 0; + reply->rmsn_sid = LST_INVALID_SID; + LASSERT(sfw_data.fw_session == NULL); + return 0; +} + +int +sfw_debug_session (srpc_debug_reqst_t *request, srpc_debug_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + + if (sn == NULL) { + reply->dbg_status = ESRCH; + reply->dbg_sid = LST_INVALID_SID; + return 0; + } + + reply->dbg_status = 0; + reply->dbg_sid = sn->sn_id; + reply->dbg_timeout = sn->sn_timeout; + if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) + >= sizeof(reply->dbg_name)) + return -E2BIG; + + return 0; +} + +void +sfw_test_rpc_fini (srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + /* Called with hold of tsi->tsi_lock */ + LASSERT (list_empty(&rpc->crpc_list)); + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); +} + +static inline int +sfw_test_buffers(sfw_test_instance_t *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + struct srpc_service *svc = tsc->tsc_srv_service; + int nbuf; + + nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; + return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); +} + +int +sfw_load_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc; + struct srpc_service *svc; + int nbuf; + int rc; + + LASSERT(tsi != NULL); + tsc = sfw_find_test_case(tsi->tsi_service); + nbuf = sfw_test_buffers(tsi); + LASSERT(tsc != NULL); + svc = tsc->tsc_srv_service; + + if (tsi->tsi_is_client) { + tsi->tsi_ops = tsc->tsc_cli_ops; + return 0; + } + + rc = srpc_service_add_buffers(svc, nbuf); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", svc->sv_name, nbuf, rc); + /* NB: this error handler is not strictly correct, because + * it may release more buffers than already allocated, + * but it doesn't matter because request portal should + * be lazy portal and will grow buffers if necessary. */ + srpc_service_remove_buffers(svc, nbuf); + return -ENOMEM; + } + + CDEBUG(D_NET, "Reserved %d buffers for test %s\n", + nbuf * (srpc_serv_is_framework(svc) ? + 1 : cfs_cpt_number(cfs_cpt_table)), svc->sv_name); + return 0; +} + +void +sfw_unload_test(struct sfw_test_instance *tsi) +{ + struct sfw_test_case *tsc = sfw_find_test_case(tsi->tsi_service); + + LASSERT(tsc != NULL); + + if (tsi->tsi_is_client) + return; + + /* shrink buffers, because request portal is lazy portal + * which can grow buffers at runtime so we may leave + * some buffers behind, but never mind... */ + srpc_service_remove_buffers(tsc->tsc_srv_service, + sfw_test_buffers(tsi)); + return; +} + +void +sfw_destroy_test_instance (sfw_test_instance_t *tsi) +{ + srpc_client_rpc_t *rpc; + sfw_test_unit_t *tsu; + + if (!tsi->tsi_is_client) goto clean; + + tsi->tsi_ops->tso_fini(tsi); + + LASSERT (!tsi->tsi_stopping); + LASSERT (list_empty(&tsi->tsi_active_rpcs)); + LASSERT (!sfw_test_active(tsi)); + + while (!list_empty(&tsi->tsi_units)) { + tsu = list_entry(tsi->tsi_units.next, + sfw_test_unit_t, tsu_list); + list_del(&tsu->tsu_list); + LIBCFS_FREE(tsu, sizeof(*tsu)); + } + + while (!list_empty(&tsi->tsi_free_rpcs)) { + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + +clean: + sfw_unload_test(tsi); + LIBCFS_FREE(tsi, sizeof(*tsi)); + return; +} + +void +sfw_destroy_batch (sfw_batch_t *tsb) +{ + sfw_test_instance_t *tsi; + + LASSERT (!sfw_batch_active(tsb)); + LASSERT (list_empty(&tsb->bat_list)); + + while (!list_empty(&tsb->bat_tests)) { + tsi = list_entry(tsb->bat_tests.next, + sfw_test_instance_t, tsi_list); + list_del_init(&tsi->tsi_list); + sfw_destroy_test_instance(tsi); + } + + LIBCFS_FREE(tsb, sizeof(sfw_batch_t)); + return; +} + +void +sfw_destroy_session (sfw_session_t *sn) +{ + sfw_batch_t *batch; + + LASSERT (list_empty(&sn->sn_list)); + LASSERT (sn != sfw_data.fw_session); + + while (!list_empty(&sn->sn_batches)) { + batch = list_entry(sn->sn_batches.next, + sfw_batch_t, bat_list); + list_del_init(&batch->bat_list); + sfw_destroy_batch(batch); + } + + LIBCFS_FREE(sn, sizeof(*sn)); + atomic_dec(&sfw_data.fw_nzombies); + return; +} + +void +sfw_unpack_addtest_req(srpc_msg_t *msg) +{ + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + LASSERT (msg->msg_type == SRPC_MSG_TEST_REQST); + LASSERT (req->tsr_is_client); + + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (req->tsr_service == SRPC_SERVICE_BRW) { + if ((msg->msg_ses_feats & LST_FEAT_BULK_LEN) == 0) { + test_bulk_req_t *bulk = &req->tsr_u.bulk_v0; + + __swab32s(&bulk->blk_opc); + __swab32s(&bulk->blk_npg); + __swab32s(&bulk->blk_flags); + + } else { + test_bulk_req_v1_t *bulk = &req->tsr_u.bulk_v1; + + __swab16s(&bulk->blk_opc); + __swab16s(&bulk->blk_flags); + __swab32s(&bulk->blk_offset); + __swab32s(&bulk->blk_len); + } + + return; + } + + if (req->tsr_service == SRPC_SERVICE_PING) { + test_ping_req_t *ping = &req->tsr_u.ping; + + __swab32s(&ping->png_size); + __swab32s(&ping->png_flags); + return; + } + + LBUG (); + return; +} + +int +sfw_add_test_instance (sfw_batch_t *tsb, srpc_server_rpc_t *rpc) +{ + srpc_msg_t *msg = &rpc->srpc_reqstbuf->buf_msg; + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + srpc_bulk_t *bk = rpc->srpc_bulk; + int ndest = req->tsr_ndest; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + int i; + int rc; + + LIBCFS_ALLOC(tsi, sizeof(*tsi)); + if (tsi == NULL) { + CERROR ("Can't allocate test instance for batch: "LPU64"\n", + tsb->bat_id.bat_id); + return -ENOMEM; + } + + memset(tsi, 0, sizeof(*tsi)); + spin_lock_init(&tsi->tsi_lock); + atomic_set(&tsi->tsi_nactive, 0); + INIT_LIST_HEAD(&tsi->tsi_units); + INIT_LIST_HEAD(&tsi->tsi_free_rpcs); + INIT_LIST_HEAD(&tsi->tsi_active_rpcs); + + tsi->tsi_stopping = 0; + tsi->tsi_batch = tsb; + tsi->tsi_loop = req->tsr_loop; + tsi->tsi_concur = req->tsr_concur; + tsi->tsi_service = req->tsr_service; + tsi->tsi_is_client = !!(req->tsr_is_client); + tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); + + rc = sfw_load_test(tsi); + if (rc != 0) { + LIBCFS_FREE(tsi, sizeof(*tsi)); + return rc; + } + + LASSERT (!sfw_batch_active(tsb)); + + if (!tsi->tsi_is_client) { + /* it's test server, just add it to tsb */ + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + + LASSERT (bk != NULL); + LASSERT (bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); + LASSERT((unsigned int)bk->bk_len >= + sizeof(lnet_process_id_packed_t) * ndest); + + sfw_unpack_addtest_req(msg); + memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); + + for (i = 0; i < ndest; i++) { + lnet_process_id_packed_t *dests; + lnet_process_id_packed_t id; + int j; + + dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].kiov_page); + LASSERT (dests != NULL); /* my pages are within KVM always */ + id = dests[i % SFW_ID_PER_PAGE]; + if (msg->msg_magic != SRPC_MSG_MAGIC) + sfw_unpack_id(id); + + for (j = 0; j < tsi->tsi_concur; j++) { + LIBCFS_ALLOC(tsu, sizeof(sfw_test_unit_t)); + if (tsu == NULL) { + rc = -ENOMEM; + CERROR ("Can't allocate tsu for %d\n", + tsi->tsi_service); + goto error; + } + + tsu->tsu_dest.nid = id.nid; + tsu->tsu_dest.pid = id.pid; + tsu->tsu_instance = tsi; + tsu->tsu_private = NULL; + list_add_tail(&tsu->tsu_list, &tsi->tsi_units); + } + } + + rc = tsi->tsi_ops->tso_init(tsi); + if (rc == 0) { + list_add_tail(&tsi->tsi_list, &tsb->bat_tests); + return 0; + } + +error: + LASSERT (rc != 0); + sfw_destroy_test_instance(tsi); + return rc; +} + +static void +sfw_test_unit_done (sfw_test_unit_t *tsu) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_batch_t *tsb = tsi->tsi_batch; + sfw_session_t *sn = tsb->bat_session; + + LASSERT (sfw_test_active(tsi)); + + if (!atomic_dec_and_test(&tsi->tsi_nactive)) + return; + + /* the test instance is done */ + spin_lock(&tsi->tsi_lock); + + tsi->tsi_stopping = 0; + + spin_unlock(&tsi->tsi_lock); + + spin_lock(&sfw_data.fw_lock); + + if (!atomic_dec_and_test(&tsb->bat_nactive) ||/* tsb still active */ + sn == sfw_data.fw_session) { /* sn also active */ + spin_unlock(&sfw_data.fw_lock); + return; + } + + LASSERT (!list_empty(&sn->sn_list)); /* I'm a zombie! */ + + list_for_each_entry (tsb, &sn->sn_batches, bat_list) { + if (sfw_batch_active(tsb)) { + spin_unlock(&sfw_data.fw_lock); + return; + } + } + + list_del_init(&sn->sn_list); + spin_unlock(&sfw_data.fw_lock); + + sfw_destroy_session(sn); + return; +} + +void +sfw_test_rpc_done (srpc_client_rpc_t *rpc) +{ + sfw_test_unit_t *tsu = rpc->crpc_priv; + sfw_test_instance_t *tsi = tsu->tsu_instance; + int done = 0; + + tsi->tsi_ops->tso_done_rpc(tsu, rpc); + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + LASSERT (!list_empty(&rpc->crpc_list)); + + list_del_init(&rpc->crpc_list); + + /* batch is stopping or loop is done or get error */ + if (tsi->tsi_stopping || + tsu->tsu_loop == 0 || + (rpc->crpc_status != 0 && tsi->tsi_stoptsu_onerr)) + done = 1; + + /* dec ref for poster */ + srpc_client_rpc_decref(rpc); + + spin_unlock(&tsi->tsi_lock); + + if (!done) { + swi_schedule_workitem(&tsu->tsu_worker); + return; + } + + sfw_test_unit_done(tsu); + return; +} + +int +sfw_create_test_rpc(sfw_test_unit_t *tsu, lnet_process_id_t peer, + unsigned features, int nblk, int blklen, + srpc_client_rpc_t **rpcpp) +{ + srpc_client_rpc_t *rpc = NULL; + sfw_test_instance_t *tsi = tsu->tsu_instance; + + spin_lock(&tsi->tsi_lock); + + LASSERT (sfw_test_active(tsi)); + + if (!list_empty(&tsi->tsi_free_rpcs)) { + /* pick request from buffer */ + rpc = list_entry(tsi->tsi_free_rpcs.next, + srpc_client_rpc_t, crpc_list); + LASSERT (nblk == rpc->crpc_bulk.bk_niov); + list_del_init(&rpc->crpc_list); + } + + spin_unlock(&tsi->tsi_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } else { + srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, + blklen, sfw_test_rpc_done, + sfw_test_rpc_fini, tsu); + } + + if (rpc == NULL) { + CERROR("Can't create rpc for test %d\n", tsi->tsi_service); + return -ENOMEM; + } + + rpc->crpc_reqstmsg.msg_ses_feats = features; + *rpcpp = rpc; + + return 0; +} + +int +sfw_run_test (swi_workitem_t *wi) +{ + sfw_test_unit_t *tsu = wi->swi_workitem.wi_data; + sfw_test_instance_t *tsi = tsu->tsu_instance; + srpc_client_rpc_t *rpc = NULL; + + LASSERT (wi == &tsu->tsu_worker); + + if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc) != 0) { + LASSERT (rpc == NULL); + goto test_done; + } + + LASSERT (rpc != NULL); + + spin_lock(&tsi->tsi_lock); + + if (tsi->tsi_stopping) { + list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); + spin_unlock(&tsi->tsi_lock); + goto test_done; + } + + if (tsu->tsu_loop > 0) + tsu->tsu_loop--; + + list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); + spin_unlock(&tsi->tsi_lock); + + rpc->crpc_timeout = rpc_timeout; + + spin_lock(&rpc->crpc_lock); + srpc_post_rpc(rpc); + spin_unlock(&rpc->crpc_lock); + return 0; + +test_done: + /* + * No one can schedule me now since: + * - previous RPC, if any, has done and + * - no new RPC is initiated. + * - my batch is still active; no one can run it again now. + * Cancel pending schedules and prevent future schedule attempts: + */ + swi_exit_workitem(wi); + sfw_test_unit_done(tsu); + return 1; +} + +int +sfw_run_batch (sfw_batch_t *tsb) +{ + swi_workitem_t *wi; + sfw_test_unit_t *tsu; + sfw_test_instance_t *tsi; + + if (sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch already active: "LPU64" (%d)\n", + tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); + return 0; + } + + list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) { + if (!tsi->tsi_is_client) /* skip server instances */ + continue; + + LASSERT (!tsi->tsi_stopping); + LASSERT (!sfw_test_active(tsi)); + + atomic_inc(&tsb->bat_nactive); + + list_for_each_entry (tsu, &tsi->tsi_units, tsu_list) { + atomic_inc(&tsi->tsi_nactive); + tsu->tsu_loop = tsi->tsi_loop; + wi = &tsu->tsu_worker; + swi_init_workitem(wi, tsu, sfw_run_test, + lst_sched_test[\ + lnet_cpt_of_nid(tsu->tsu_dest.nid)]); + swi_schedule_workitem(wi); + } + } + + return 0; +} + +int +sfw_stop_batch (sfw_batch_t *tsb, int force) +{ + sfw_test_instance_t *tsi; + srpc_client_rpc_t *rpc; + + if (!sfw_batch_active(tsb)) { + CDEBUG(D_NET, "Batch "LPU64" inactive\n", tsb->bat_id.bat_id); + return 0; + } + + list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) { + spin_lock(&tsi->tsi_lock); + + if (!tsi->tsi_is_client || + !sfw_test_active(tsi) || tsi->tsi_stopping) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + tsi->tsi_stopping = 1; + + if (!force) { + spin_unlock(&tsi->tsi_lock); + continue; + } + + /* abort launched rpcs in the test */ + list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { + spin_lock(&rpc->crpc_lock); + + srpc_abort_rpc(rpc, -EINTR); + + spin_unlock(&rpc->crpc_lock); + } + + spin_unlock(&tsi->tsi_lock); + } + + return 0; +} + +int +sfw_query_batch (sfw_batch_t *tsb, int testidx, srpc_batch_reply_t *reply) +{ + sfw_test_instance_t *tsi; + + if (testidx < 0) + return -EINVAL; + + if (testidx == 0) { + reply->bar_active = atomic_read(&tsb->bat_nactive); + return 0; + } + + list_for_each_entry (tsi, &tsb->bat_tests, tsi_list) { + if (testidx-- > 1) + continue; + + reply->bar_active = atomic_read(&tsi->tsi_nactive); + return 0; + } + + return -ENOENT; +} + +void +sfw_free_pages (srpc_server_rpc_t *rpc) +{ + srpc_free_bulk(rpc->srpc_bulk); + rpc->srpc_bulk = NULL; +} + +int +sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, + int sink) +{ + LASSERT(rpc->srpc_bulk == NULL); + LASSERT(npages > 0 && npages <= LNET_MAX_IOV); + + rpc->srpc_bulk = srpc_alloc_bulk(cpt, npages, len, sink); + if (rpc->srpc_bulk == NULL) + return -ENOMEM; + + return 0; +} + +int +sfw_add_test (srpc_server_rpc_t *rpc) +{ + sfw_session_t *sn = sfw_data.fw_session; + srpc_test_reply_t *reply = &rpc->srpc_replymsg.msg_body.tes_reply; + srpc_test_reqst_t *request; + int rc; + sfw_batch_t *bat; + + request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; + reply->tsr_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (request->tsr_loop == 0 || + request->tsr_concur == 0 || + request->tsr_sid.ses_nid == LNET_NID_ANY || + request->tsr_ndest > SFW_MAX_NDESTS || + (request->tsr_is_client && request->tsr_ndest == 0) || + request->tsr_concur > SFW_MAX_CONCUR || + request->tsr_service > SRPC_SERVICE_MAX_ID || + request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { + reply->tsr_status = EINVAL; + return 0; + } + + if (sn == NULL || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || + sfw_find_test_case(request->tsr_service) == NULL) { + reply->tsr_status = ENOENT; + return 0; + } + + bat = sfw_bid2batch(request->tsr_bid); + if (bat == NULL) { + CERROR ("Dropping RPC (%s) from %s under memory pressure.\n", + rpc->srpc_scd->scd_svc->sv_name, + libcfs_id2str(rpc->srpc_peer)); + return -ENOMEM; + } + + if (sfw_batch_active(bat)) { + reply->tsr_status = EBUSY; + return 0; + } + + if (request->tsr_is_client && rpc->srpc_bulk == NULL) { + /* rpc will be resumed later in sfw_bulk_ready */ + int npg = sfw_id_pages(request->tsr_ndest); + int len; + + if ((sn->sn_features & LST_FEAT_BULK_LEN) == 0) { + len = npg * PAGE_CACHE_SIZE; + + } else { + len = sizeof(lnet_process_id_packed_t) * + request->tsr_ndest; + } + + return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); + } + + rc = sfw_add_test_instance(bat, rpc); + CDEBUG (rc == 0 ? D_NET : D_WARNING, + "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", + rc == 0 ? "Added" : "Failed to add", request->tsr_service, + request->tsr_is_client ? "client" : "server", + request->tsr_loop, request->tsr_concur, request->tsr_ndest); + + reply->tsr_status = (rc < 0) ? -rc : rc; + return 0; +} + +int +sfw_control_batch (srpc_batch_reqst_t *request, srpc_batch_reply_t *reply) +{ + sfw_session_t *sn = sfw_data.fw_session; + int rc = 0; + sfw_batch_t *bat; + + reply->bar_sid = (sn == NULL) ? LST_INVALID_SID : sn->sn_id; + + if (sn == NULL || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { + reply->bar_status = ESRCH; + return 0; + } + + bat = sfw_find_batch(request->bar_bid); + if (bat == NULL) { + reply->bar_status = ENOENT; + return 0; + } + + switch (request->bar_opc) { + case SRPC_BATCH_OPC_RUN: + rc = sfw_run_batch(bat); + break; + + case SRPC_BATCH_OPC_STOP: + rc = sfw_stop_batch(bat, request->bar_arg); + break; + + case SRPC_BATCH_OPC_QUERY: + rc = sfw_query_batch(bat, request->bar_testidx, reply); + break; + + default: + return -EINVAL; /* drop it */ + } + + reply->bar_status = (rc < 0) ? -rc : rc; + return 0; +} + +int +sfw_handle_server_rpc(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reply = &rpc->srpc_replymsg; + srpc_msg_t *request = &rpc->srpc_reqstbuf->buf_msg; + unsigned features = LST_FEATS_MASK; + int rc = 0; + + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&sfw_data.fw_lock); + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + /* Remove timer to avoid racing with it or expiring active session */ + if (sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer.", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + sfw_unpack_message(request); + LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); + + /* rpc module should have checked this */ + LASSERT(request->msg_version == SRPC_MSG_VERSION); + + if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && + sv->sv_id != SRPC_SERVICE_DEBUG) { + sfw_session_t *sn = sfw_data.fw_session; + + if (sn != NULL && + sn->sn_features != request->msg_ses_feats) { + CNETERR("Features of framework RPC don't match " + "features of current session: %x/%x\n", + request->msg_ses_feats, sn->sn_features); + reply->msg_body.reply.status = EPROTO; + reply->msg_body.reply.sid = sn->sn_id; + goto out; + } + + } else if ((request->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + /* NB: at this point, old version will ignore features and + * create new session anyway, so console should be able + * to handle this */ + reply->msg_body.reply.status = EPROTO; + goto out; + } + + switch(sv->sv_id) { + default: + LBUG (); + case SRPC_SERVICE_TEST: + rc = sfw_add_test(rpc); + break; + + case SRPC_SERVICE_BATCH: + rc = sfw_control_batch(&request->msg_body.bat_reqst, + &reply->msg_body.bat_reply); + break; + + case SRPC_SERVICE_QUERY_STAT: + rc = sfw_get_stats(&request->msg_body.stat_reqst, + &reply->msg_body.stat_reply); + break; + + case SRPC_SERVICE_DEBUG: + rc = sfw_debug_session(&request->msg_body.dbg_reqst, + &reply->msg_body.dbg_reply); + break; + + case SRPC_SERVICE_MAKE_SESSION: + rc = sfw_make_session(&request->msg_body.mksn_reqst, + &reply->msg_body.mksn_reply); + break; + + case SRPC_SERVICE_REMOVE_SESSION: + rc = sfw_remove_session(&request->msg_body.rmsn_reqst, + &reply->msg_body.rmsn_reply); + break; + } + + if (sfw_data.fw_session != NULL) + features = sfw_data.fw_session->sn_features; + out: + reply->msg_ses_feats = features; + rpc->srpc_done = sfw_server_rpc_done; + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +int +sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + int rc; + + LASSERT(rpc->srpc_bulk != NULL); + LASSERT(sv->sv_id == SRPC_SERVICE_TEST); + LASSERT(sfw_data.fw_active_srpc == NULL); + LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); + + spin_lock(&sfw_data.fw_lock); + + if (status != 0) { + CERROR("Bulk transfer failed for RPC: " + "service %s, peer %s, status %d\n", + sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); + spin_unlock(&sfw_data.fw_lock); + return -EIO; + } + + if (sfw_data.fw_shuttingdown) { + spin_unlock(&sfw_data.fw_lock); + return -ESHUTDOWN; + } + + if (sfw_del_session_timer() != 0) { + CERROR("Dropping RPC (%s) from %s: racing with expiry timer", + sv->sv_name, libcfs_id2str(rpc->srpc_peer)); + spin_unlock(&sfw_data.fw_lock); + return -EAGAIN; + } + + sfw_data.fw_active_srpc = rpc; + spin_unlock(&sfw_data.fw_lock); + + rc = sfw_add_test(rpc); + + spin_lock(&sfw_data.fw_lock); + + if (!sfw_data.fw_shuttingdown) + sfw_add_session_timer(); + + sfw_data.fw_active_srpc = NULL; + spin_unlock(&sfw_data.fw_lock); + return rc; +} + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc = NULL; + + spin_lock(&sfw_data.fw_lock); + + LASSERT (!sfw_data.fw_shuttingdown); + LASSERT (service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + if (nbulkiov == 0 && !list_empty(&sfw_data.fw_zombie_rpcs)) { + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + srpc_init_client_rpc(rpc, peer, service, 0, 0, + done, sfw_client_rpc_fini, priv); + } + + spin_unlock(&sfw_data.fw_lock); + + if (rpc == NULL) { + rpc = srpc_create_client_rpc(peer, service, + nbulkiov, bulklen, done, + nbulkiov != 0 ? NULL : + sfw_client_rpc_fini, + priv); + } + + if (rpc != NULL) /* "session" is concept in framework */ + rpc->crpc_reqstmsg.msg_ses_feats = features; + + return rpc; +} + +void +sfw_unpack_message (srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* srpc module should guarantee I wouldn't get crap */ + LASSERT (msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + if (msg->msg_type == SRPC_MSG_STAT_REQST) { + srpc_stat_reqst_t *req = &msg->msg_body.stat_reqst; + + __swab32s(&req->str_type); + __swab64s(&req->str_rpyid); + sfw_unpack_sid(req->str_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_STAT_REPLY) { + srpc_stat_reply_t *rep = &msg->msg_body.stat_reply; + + __swab32s(&rep->str_status); + sfw_unpack_sid(rep->str_sid); + sfw_unpack_fw_counters(rep->str_fw); + sfw_unpack_rpc_counters(rep->str_rpc); + sfw_unpack_lnet_counters(rep->str_lnet); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REQST) { + srpc_mksn_reqst_t *req = &msg->msg_body.mksn_reqst; + + __swab64s(&req->mksn_rpyid); + __swab32s(&req->mksn_force); + sfw_unpack_sid(req->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { + srpc_mksn_reply_t *rep = &msg->msg_body.mksn_reply; + + __swab32s(&rep->mksn_status); + __swab32s(&rep->mksn_timeout); + sfw_unpack_sid(rep->mksn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REQST) { + srpc_rmsn_reqst_t *req = &msg->msg_body.rmsn_reqst; + + __swab64s(&req->rmsn_rpyid); + sfw_unpack_sid(req->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { + srpc_rmsn_reply_t *rep = &msg->msg_body.rmsn_reply; + + __swab32s(&rep->rmsn_status); + sfw_unpack_sid(rep->rmsn_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { + srpc_debug_reqst_t *req = &msg->msg_body.dbg_reqst; + + __swab64s(&req->dbg_rpyid); + __swab32s(&req->dbg_flags); + sfw_unpack_sid(req->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { + srpc_debug_reply_t *rep = &msg->msg_body.dbg_reply; + + __swab32s(&rep->dbg_nbatch); + __swab32s(&rep->dbg_timeout); + sfw_unpack_sid(rep->dbg_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REQST) { + srpc_batch_reqst_t *req = &msg->msg_body.bat_reqst; + + __swab32s(&req->bar_opc); + __swab64s(&req->bar_rpyid); + __swab32s(&req->bar_testidx); + __swab32s(&req->bar_arg); + sfw_unpack_sid(req->bar_sid); + __swab64s(&req->bar_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { + srpc_batch_reply_t *rep = &msg->msg_body.bat_reply; + + __swab32s(&rep->bar_status); + sfw_unpack_sid(rep->bar_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REQST) { + srpc_test_reqst_t *req = &msg->msg_body.tes_reqst; + + __swab64s(&req->tsr_rpyid); + __swab64s(&req->tsr_bulkid); + __swab32s(&req->tsr_loop); + __swab32s(&req->tsr_ndest); + __swab32s(&req->tsr_concur); + __swab32s(&req->tsr_service); + sfw_unpack_sid(req->tsr_sid); + __swab64s(&req->tsr_bid.bat_id); + return; + } + + if (msg->msg_type == SRPC_MSG_TEST_REPLY) { + srpc_test_reply_t *rep = &msg->msg_body.tes_reply; + + __swab32s(&rep->tsr_status); + sfw_unpack_sid(rep->tsr_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REQST) { + srpc_join_reqst_t *req = &msg->msg_body.join_reqst; + + __swab64s(&req->join_rpyid); + sfw_unpack_sid(req->join_sid); + return; + } + + if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { + srpc_join_reply_t *rep = &msg->msg_body.join_reply; + + __swab32s(&rep->join_status); + __swab32s(&rep->join_timeout); + sfw_unpack_sid(rep->join_sid); + return; + } + + LBUG (); + return; +} + +void +sfw_abort_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT(atomic_read(&rpc->crpc_refcount) > 0); + LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); + + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, -EINTR); + spin_unlock(&rpc->crpc_lock); + return; +} + +void +sfw_post_rpc (srpc_client_rpc_t *rpc) +{ + spin_lock(&rpc->crpc_lock); + + LASSERT (!rpc->crpc_closed); + LASSERT (!rpc->crpc_aborted); + LASSERT (list_empty(&rpc->crpc_list)); + LASSERT (!sfw_data.fw_shuttingdown); + + rpc->crpc_timeout = rpc_timeout; + srpc_post_rpc(rpc); + + spin_unlock(&rpc->crpc_lock); + return; +} + +static srpc_service_t sfw_services[] = +{ + { + /* sv_id */ SRPC_SERVICE_DEBUG, + /* sv_name */ "debug", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_QUERY_STAT, + /* sv_name */ "query stats", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_MAKE_SESSION, + /* sv_name */ "make session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, + /* sv_name */ "remove session", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_BATCH, + /* sv_name */ "batch service", + 0 + }, + { + /* sv_id */ SRPC_SERVICE_TEST, + /* sv_name */ "test service", + 0 + }, + { + /* sv_id */ 0, + /* sv_name */ NULL, + 0 + } +}; + +extern sfw_test_client_ops_t ping_test_client; +extern srpc_service_t ping_test_service; +extern void ping_init_test_client(void); +extern void ping_init_test_service(void); + +extern sfw_test_client_ops_t brw_test_client; +extern srpc_service_t brw_test_service; +extern void brw_init_test_client(void); +extern void brw_init_test_service(void); + + +int +sfw_startup (void) +{ + int i; + int rc; + int error; + srpc_service_t *sv; + sfw_test_case_t *tsc; + + + if (session_timeout < 0) { + CERROR ("Session timeout must be non-negative: %d\n", + session_timeout); + return -EINVAL; + } + + if (rpc_timeout < 0) { + CERROR ("RPC timeout must be non-negative: %d\n", + rpc_timeout); + return -EINVAL; + } + + if (session_timeout == 0) + CWARN ("Zero session_timeout specified " + "- test sessions never expire.\n"); + + if (rpc_timeout == 0) + CWARN ("Zero rpc_timeout specified " + "- test RPC never expire.\n"); + + memset(&sfw_data, 0, sizeof(struct smoketest_framework)); + + sfw_data.fw_session = NULL; + sfw_data.fw_active_srpc = NULL; + spin_lock_init(&sfw_data.fw_lock); + atomic_set(&sfw_data.fw_nzombies, 0); + INIT_LIST_HEAD(&sfw_data.fw_tests); + INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); + INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); + + brw_init_test_client(); + brw_init_test_service(); + rc = sfw_register_test(&brw_test_service, &brw_test_client); + LASSERT (rc == 0); + + ping_init_test_client(); + ping_init_test_service(); + rc = sfw_register_test(&ping_test_service, &ping_test_client); + LASSERT (rc == 0); + + error = 0; + list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) break; + + sv->sv_bulk_ready = NULL; + sv->sv_handler = sfw_handle_server_rpc; + sv->sv_wi_total = SFW_FRWK_WI_MAX; + if (sv->sv_id == SRPC_SERVICE_TEST) + sv->sv_bulk_ready = sfw_bulk_ready; + + rc = srpc_add_service(sv); + LASSERT (rc != -EBUSY); + if (rc != 0) { + CWARN ("Failed to add %s service: %d\n", + sv->sv_name, rc); + error = rc; + } + + /* about to sfw_shutdown, no need to add buffer */ + if (error) continue; + + rc = srpc_service_add_buffers(sv, sv->sv_wi_total); + if (rc != 0) { + CWARN("Failed to reserve enough buffers: " + "service %s, %d needed: %d\n", + sv->sv_name, sv->sv_wi_total, rc); + error = -ENOMEM; + } + } + + if (error != 0) + sfw_shutdown(); + return error; +} + +void +sfw_shutdown (void) +{ + srpc_service_t *sv; + sfw_test_case_t *tsc; + int i; + + spin_lock(&sfw_data.fw_lock); + + sfw_data.fw_shuttingdown = 1; + lst_wait_until(sfw_data.fw_active_srpc == NULL, sfw_data.fw_lock, + "waiting for active RPC to finish.\n"); + + if (sfw_del_session_timer() != 0) + lst_wait_until(sfw_data.fw_session == NULL, sfw_data.fw_lock, + "waiting for session timer to explode.\n"); + + sfw_deactivate_session(); + lst_wait_until(atomic_read(&sfw_data.fw_nzombies) == 0, + sfw_data.fw_lock, + "waiting for %d zombie sessions to die.\n", + atomic_read(&sfw_data.fw_nzombies)); + + spin_unlock(&sfw_data.fw_lock); + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + list_for_each_entry (tsc, &sfw_data.fw_tests, tsc_list) { + sv = tsc->tsc_srv_service; + srpc_shutdown_service(sv); + srpc_remove_service(sv); + } + + while (!list_empty(&sfw_data.fw_zombie_rpcs)) { + srpc_client_rpc_t *rpc; + + rpc = list_entry(sfw_data.fw_zombie_rpcs.next, + srpc_client_rpc_t, crpc_list); + list_del(&rpc->crpc_list); + + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } + + for (i = 0; ; i++) { + sv = &sfw_services[i]; + if (sv->sv_name == NULL) + break; + + srpc_wait_service_shutdown(sv); + } + + while (!list_empty(&sfw_data.fw_tests)) { + tsc = list_entry(sfw_data.fw_tests.next, + sfw_test_case_t, tsc_list); + + srpc_wait_service_shutdown(tsc->tsc_srv_service); + + list_del(&tsc->tsc_list); + LIBCFS_FREE(tsc, sizeof(*tsc)); + } + + return; +} diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c new file mode 100644 index 000000000000..5257e5630a0e --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/module.c @@ -0,0 +1,169 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +enum { + LST_INIT_NONE = 0, + LST_INIT_WI_SERIAL, + LST_INIT_WI_TEST, + LST_INIT_RPC, + LST_INIT_FW, + LST_INIT_CONSOLE +}; + +extern int lstcon_console_init(void); +extern int lstcon_console_fini(void); + +static int lst_init_step = LST_INIT_NONE; + +struct cfs_wi_sched *lst_sched_serial; +struct cfs_wi_sched **lst_sched_test; + +void +lnet_selftest_fini(void) +{ + int i; + + switch (lst_init_step) { + case LST_INIT_CONSOLE: + lstcon_console_fini(); + case LST_INIT_FW: + sfw_shutdown(); + case LST_INIT_RPC: + srpc_shutdown(); + case LST_INIT_WI_TEST: + for (i = 0; + i < cfs_cpt_number(lnet_cpt_table()); i++) { + if (lst_sched_test[i] == NULL) + continue; + cfs_wi_sched_destroy(lst_sched_test[i]); + } + LIBCFS_FREE(lst_sched_test, + sizeof(lst_sched_test[0]) * + cfs_cpt_number(lnet_cpt_table())); + lst_sched_test = NULL; + + case LST_INIT_WI_SERIAL: + cfs_wi_sched_destroy(lst_sched_serial); + lst_sched_serial = NULL; + case LST_INIT_NONE: + break; + default: + LBUG(); + } + return; +} + +void +lnet_selftest_structure_assertion(void) +{ + CLASSERT(sizeof(srpc_msg_t) == 160); + CLASSERT(sizeof(srpc_test_reqst_t) == 70); + CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_concur) == 72); + CLASSERT(offsetof(srpc_msg_t, msg_body.tes_reqst.tsr_ndest) == 78); + CLASSERT(sizeof(srpc_stat_reply_t) == 136); + CLASSERT(sizeof(srpc_stat_reqst_t) == 28); +} + +int +lnet_selftest_init(void) +{ + int nscheds; + int rc; + int i; + + rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, + 1, &lst_sched_serial); + if (rc != 0) { + CERROR("Failed to create serial WI scheduler for LST\n"); + return rc; + } + lst_init_step = LST_INIT_WI_SERIAL; + + nscheds = cfs_cpt_number(lnet_cpt_table()); + LIBCFS_ALLOC(lst_sched_test, sizeof(lst_sched_test[0]) * nscheds); + if (lst_sched_test == NULL) + goto error; + + lst_init_step = LST_INIT_WI_TEST; + for (i = 0; i < nscheds; i++) { + int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + + /* reserve at least one CPU for LND */ + nthrs = max(nthrs - 1, 1); + rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, + nthrs, &lst_sched_test[i]); + if (rc != 0) { + CERROR("Failed to create CPT affinity WI scheduler " + "%d for LST\n", i); + goto error; + } + } + + rc = srpc_startup(); + if (rc != 0) { + CERROR("LST can't startup rpc\n"); + goto error; + } + lst_init_step = LST_INIT_RPC; + + rc = sfw_startup(); + if (rc != 0) { + CERROR("LST can't startup framework\n"); + goto error; + } + lst_init_step = LST_INIT_FW; + + rc = lstcon_console_init(); + if (rc != 0) { + CERROR("LST can't startup console\n"); + goto error; + } + lst_init_step = LST_INIT_CONSOLE; + return 0; +error: + lnet_selftest_fini(); + return rc; +} + + +MODULE_DESCRIPTION("LNet Selftest"); +MODULE_LICENSE("GPL"); + +cfs_module(lnet, "0.9.0", lnet_selftest_init, lnet_selftest_fini); diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c new file mode 100644 index 000000000000..f0f919482b56 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/ping_test.c @@ -0,0 +1,229 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/conctl.c + * + * Test client & Server + * + * Author: Liang Zhen + */ + +#include "selftest.h" + +#define LST_PING_TEST_MAGIC 0xbabeface + +int ping_srv_workitems = SFW_TEST_WI_MAX; +CFS_MODULE_PARM(ping_srv_workitems, "i", int, 0644, "# PING server workitems"); + +typedef struct { + spinlock_t pnd_lock; /* serialize */ + int pnd_counter; /* sequence counter */ +} lst_ping_data_t; + +static lst_ping_data_t lst_ping_data; + +static int +ping_client_init(sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + + LASSERT(tsi->tsi_is_client); + LASSERT(sn != NULL && (sn->sn_features & ~LST_FEATS_MASK) == 0); + + spin_lock_init(&lst_ping_data.pnd_lock); + lst_ping_data.pnd_counter = 0; + + return 0; +} + +static void +ping_client_fini (sfw_test_instance_t *tsi) +{ + sfw_session_t *sn = tsi->tsi_batch->bat_session; + int errors; + + LASSERT (sn != NULL); + LASSERT (tsi->tsi_is_client); + + errors = atomic_read(&sn->sn_ping_errors); + if (errors) + CWARN ("%d pings have failed.\n", errors); + else + CDEBUG (D_NET, "Ping test finished OK.\n"); +} + +static int +ping_client_prep_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t dest, srpc_client_rpc_t **rpc) +{ + srpc_ping_reqst_t *req; + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + struct timeval tv; + int rc; + + LASSERT(sn != NULL); + LASSERT((sn->sn_features & ~LST_FEATS_MASK) == 0); + + rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); + if (rc != 0) + return rc; + + req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; + + req->pnr_magic = LST_PING_TEST_MAGIC; + + spin_lock(&lst_ping_data.pnd_lock); + req->pnr_seq = lst_ping_data.pnd_counter++; + spin_unlock(&lst_ping_data.pnd_lock); + + cfs_fs_timeval(&tv); + req->pnr_time_sec = tv.tv_sec; + req->pnr_time_usec = tv.tv_usec; + + return rc; +} + +static void +ping_client_done_rpc (sfw_test_unit_t *tsu, srpc_client_rpc_t *rpc) +{ + sfw_test_instance_t *tsi = tsu->tsu_instance; + sfw_session_t *sn = tsi->tsi_batch->bat_session; + srpc_ping_reqst_t *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; + srpc_ping_reply_t *reply = &rpc->crpc_replymsg.msg_body.ping_reply; + struct timeval tv; + + LASSERT (sn != NULL); + + if (rpc->crpc_status != 0) { + if (!tsi->tsi_stopping) /* rpc could have been aborted */ + atomic_inc(&sn->sn_ping_errors); + CERROR ("Unable to ping %s (%d): %d\n", + libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq, rpc->crpc_status); + return; + } + + if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { + __swab32s(&reply->pnr_seq); + __swab32s(&reply->pnr_magic); + __swab32s(&reply->pnr_status); + } + + if (reply->pnr_magic != LST_PING_TEST_MAGIC) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad magic %u from %s, %u expected.\n", + reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), + LST_PING_TEST_MAGIC); + return; + } + + if (reply->pnr_seq != reqst->pnr_seq) { + rpc->crpc_status = -EBADMSG; + atomic_inc(&sn->sn_ping_errors); + CERROR ("Bad seq %u from %s, %u expected.\n", + reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), + reqst->pnr_seq); + return; + } + + cfs_fs_timeval(&tv); + CDEBUG (D_NET, "%d reply in %u usec\n", reply->pnr_seq, + (unsigned)((tv.tv_sec - (unsigned)reqst->pnr_time_sec) * 1000000 + + (tv.tv_usec - reqst->pnr_time_usec))); + return; +} + +static int +ping_server_handle(struct srpc_server_rpc *rpc) +{ + struct srpc_service *sv = rpc->srpc_scd->scd_svc; + srpc_msg_t *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; + srpc_msg_t *replymsg = &rpc->srpc_replymsg; + srpc_ping_reqst_t *req = &reqstmsg->msg_body.ping_reqst; + srpc_ping_reply_t *rep = &rpc->srpc_replymsg.msg_body.ping_reply; + + LASSERT (sv->sv_id == SRPC_SERVICE_PING); + + if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { + LASSERT (reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); + + __swab32s(&req->pnr_seq); + __swab32s(&req->pnr_magic); + __swab64s(&req->pnr_time_sec); + __swab64s(&req->pnr_time_usec); + } + LASSERT (reqstmsg->msg_type == srpc_service2request(sv->sv_id)); + + if (req->pnr_magic != LST_PING_TEST_MAGIC) { + CERROR ("Unexpect magic %08x from %s\n", + req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); + return -EINVAL; + } + + rep->pnr_seq = req->pnr_seq; + rep->pnr_magic = LST_PING_TEST_MAGIC; + + if ((reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) != 0) { + replymsg->msg_ses_feats = LST_FEATS_MASK; + rep->pnr_status = EPROTO; + return 0; + } + + replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; + + CDEBUG(D_NET, "Get ping %d from %s\n", + req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); + return 0; +} + +sfw_test_client_ops_t ping_test_client; +void ping_init_test_client(void) +{ + ping_test_client.tso_init = ping_client_init; + ping_test_client.tso_fini = ping_client_fini; + ping_test_client.tso_prep_rpc = ping_client_prep_rpc; + ping_test_client.tso_done_rpc = ping_client_done_rpc; +} + +srpc_service_t ping_test_service; +void ping_init_test_service(void) +{ + ping_test_service.sv_id = SRPC_SERVICE_PING; + ping_test_service.sv_name = "ping_test"; + ping_test_service.sv_handler = ping_server_handle; + ping_test_service.sv_wi_total = ping_srv_workitems; +} diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c new file mode 100644 index 000000000000..91d83f4b746e --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/rpc.c @@ -0,0 +1,1665 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/rpc.c + * + * Author: Isaac Huang + * + * 2012-05-13: Liang Zhen + * - percpt data for service to improve smp performance + * - code cleanup + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + +typedef enum { + SRPC_STATE_NONE, + SRPC_STATE_NI_INIT, + SRPC_STATE_EQ_INIT, + SRPC_STATE_RUNNING, + SRPC_STATE_STOPPING, +} srpc_state_t; + +struct smoketest_rpc { + spinlock_t rpc_glock; /* global lock */ + srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; + lnet_handle_eq_t rpc_lnet_eq; /* _the_ LNet event queue */ + srpc_state_t rpc_state; + srpc_counters_t rpc_counters; + __u64 rpc_matchbits; /* matchbits counter */ +} srpc_data; + +static inline int +srpc_serv_portal(int svc_id) +{ + return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? + SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; +} + +/* forward ref's */ +int srpc_handle_rpc (swi_workitem_t *wi); + +void srpc_get_counters (srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + *cnt = srpc_data.rpc_counters; + spin_unlock(&srpc_data.rpc_glock); +} + +void srpc_set_counters (const srpc_counters_t *cnt) +{ + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters = *cnt; + spin_unlock(&srpc_data.rpc_glock); +} + +int +srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob) +{ + nob = min(nob, (int)PAGE_CACHE_SIZE); + + LASSERT(nob > 0); + LASSERT(i >= 0 && i < bk->bk_niov); + + bk->bk_iovs[i].kiov_offset = 0; + bk->bk_iovs[i].kiov_page = pg; + bk->bk_iovs[i].kiov_len = nob; + return nob; +} + +void +srpc_free_bulk (srpc_bulk_t *bk) +{ + int i; + struct page *pg; + + LASSERT (bk != NULL); + + for (i = 0; i < bk->bk_niov; i++) { + pg = bk->bk_iovs[i].kiov_page; + if (pg == NULL) break; + + __free_page(pg); + } + + LIBCFS_FREE(bk, offsetof(srpc_bulk_t, bk_iovs[bk->bk_niov])); + return; +} + +srpc_bulk_t * +srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, int sink) +{ + srpc_bulk_t *bk; + struct page **pages; + int i; + + LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); + + LIBCFS_CPT_ALLOC(bk, lnet_cpt_table(), cpt, + offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + if (bk == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); + return NULL; + } + + memset(bk, 0, offsetof(srpc_bulk_t, bk_iovs[bulk_npg])); + bk->bk_sink = sink; + bk->bk_len = bulk_len; + bk->bk_niov = bulk_npg; + UNUSED(pages); + + for (i = 0; i < bulk_npg; i++) { + struct page *pg; + int nob; + + pg = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, GFP_IOFS); + if (pg == NULL) { + CERROR("Can't allocate page %d of %d\n", i, bulk_npg); + srpc_free_bulk(bk); + return NULL; + } + + nob = srpc_add_bulk_page(bk, pg, i, bulk_len); + bulk_len -= nob; + } + + return bk; +} + +static inline __u64 +srpc_next_id (void) +{ + __u64 id; + + spin_lock(&srpc_data.rpc_glock); + id = srpc_data.rpc_matchbits++; + spin_unlock(&srpc_data.rpc_glock); + return id; +} + +void +srpc_init_server_rpc(struct srpc_server_rpc *rpc, + struct srpc_service_cd *scd, + struct srpc_buffer *buffer) +{ + memset(rpc, 0, sizeof(*rpc)); + swi_init_workitem(&rpc->srpc_wi, rpc, srpc_handle_rpc, + srpc_serv_is_framework(scd->scd_svc) ? + lst_sched_serial : lst_sched_test[scd->scd_cpt]); + + rpc->srpc_ev.ev_fired = 1; /* no event expected now */ + + rpc->srpc_scd = scd; + rpc->srpc_reqstbuf = buffer; + rpc->srpc_peer = buffer->buf_peer; + rpc->srpc_self = buffer->buf_self; + LNetInvalidateHandle(&rpc->srpc_replymdh); +} + +static void +srpc_service_fini(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + struct srpc_buffer *buf; + struct list_head *q; + int i; + + if (svc->sv_cpt_data == NULL) + return; + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + while (1) { + if (!list_empty(&scd->scd_buf_posted)) + q = &scd->scd_buf_posted; + else if (!list_empty(&scd->scd_buf_blocked)) + q = &scd->scd_buf_blocked; + else + break; + + while (!list_empty(q)) { + buf = list_entry(q->next, + struct srpc_buffer, + buf_list); + list_del(&buf->buf_list); + LIBCFS_FREE(buf, sizeof(*buf)); + } + } + + LASSERT(list_empty(&scd->scd_rpc_active)); + + while (!list_empty(&scd->scd_rpc_free)) { + rpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&rpc->srpc_list); + LIBCFS_FREE(rpc, sizeof(*rpc)); + } + } + + cfs_percpt_free(svc->sv_cpt_data); + svc->sv_cpt_data = NULL; +} + +static int +srpc_service_nrpcs(struct srpc_service *svc) +{ + int nrpcs = svc->sv_wi_total / svc->sv_ncpts; + + return srpc_serv_is_framework(svc) ? + max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); +} + +int srpc_add_buffer(struct swi_workitem *wi); + +static int +srpc_service_init(struct srpc_service *svc) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int nrpcs; + int i; + int j; + + svc->sv_shuttingdown = 0; + + svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(struct srpc_service_cd)); + if (svc->sv_cpt_data == NULL) + return -ENOMEM; + + svc->sv_ncpts = srpc_serv_is_framework(svc) ? + 1 : cfs_cpt_number(lnet_cpt_table()); + nrpcs = srpc_service_nrpcs(svc); + + cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { + scd->scd_cpt = i; + scd->scd_svc = svc; + spin_lock_init(&scd->scd_lock); + INIT_LIST_HEAD(&scd->scd_rpc_free); + INIT_LIST_HEAD(&scd->scd_rpc_active); + INIT_LIST_HEAD(&scd->scd_buf_posted); + INIT_LIST_HEAD(&scd->scd_buf_blocked); + + scd->scd_ev.ev_data = scd; + scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; + + /* NB: don't use lst_sched_serial for adding buffer, + * see details in srpc_service_add_buffers() */ + swi_init_workitem(&scd->scd_buf_wi, scd, + srpc_add_buffer, lst_sched_test[i]); + + if (i != 0 && srpc_serv_is_framework(svc)) { + /* NB: framework service only needs srpc_service_cd for + * one partition, but we allocate for all to make + * it easier to implement, it will waste a little + * memory but nobody should care about this */ + continue; + } + + for (j = 0; j < nrpcs; j++) { + LIBCFS_CPT_ALLOC(rpc, lnet_cpt_table(), + i, sizeof(*rpc)); + if (rpc == NULL) { + srpc_service_fini(svc); + return -ENOMEM; + } + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + } + + return 0; +} + +int +srpc_add_service(struct srpc_service *sv) +{ + int id = sv->sv_id; + + LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); + + if (srpc_service_init(sv) != 0) + return -ENOMEM; + + spin_lock(&srpc_data.rpc_glock); + + LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); + + if (srpc_data.rpc_services[id] != NULL) { + spin_unlock(&srpc_data.rpc_glock); + goto failed; + } + + srpc_data.rpc_services[id] = sv; + spin_unlock(&srpc_data.rpc_glock); + + CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); + return 0; + + failed: + srpc_service_fini(sv); + return -EBUSY; +} + +int +srpc_remove_service (srpc_service_t *sv) +{ + int id = sv->sv_id; + + spin_lock(&srpc_data.rpc_glock); + + if (srpc_data.rpc_services[id] != sv) { + spin_unlock(&srpc_data.rpc_glock); + return -ENOENT; + } + + srpc_data.rpc_services[id] = NULL; + spin_unlock(&srpc_data.rpc_glock); + return 0; +} + +int +srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, + int len, int options, lnet_process_id_t peer, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + lnet_handle_me_t meh; + + rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, + local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); + if (rc != 0) { + CERROR ("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + md.threshold = 1; + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.options = options; + md.eq_handle = srpc_data.rpc_lnet_eq; + + rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + + rc = LNetMEUnlink(meh); + LASSERT (rc == 0); + return -ENOMEM; + } + + CDEBUG (D_NET, + "Posted passive RDMA: peer %s, portal %d, matchbits "LPX64"\n", + libcfs_id2str(peer), portal, matchbits); + return 0; +} + +int +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, + int options, lnet_process_id_t peer, lnet_nid_t self, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + int rc; + lnet_md_t md; + + md.user_ptr = ev; + md.start = buf; + md.length = len; + md.eq_handle = srpc_data.rpc_lnet_eq; + md.threshold = ((options & LNET_MD_OP_GET) != 0) ? 2 : 1; + md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); + + rc = LNetMDBind(md, LNET_UNLINK, mdh); + if (rc != 0) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + return -ENOMEM; + } + + /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. + * they're only meaningful for MDs attached to an ME (i.e. passive + * buffers... */ + if ((options & LNET_MD_OP_PUT) != 0) { + rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, + portal, matchbits, 0, 0); + } else { + LASSERT ((options & LNET_MD_OP_GET) != 0); + + rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); + } + + if (rc != 0) { + CERROR ("LNet%s(%s, %d, "LPD64") failed: %d\n", + ((options & LNET_MD_OP_PUT) != 0) ? "Put" : "Get", + libcfs_id2str(peer), portal, matchbits, rc); + + /* The forthcoming unlink event will complete this operation + * with failure, so fall through and return success here. + */ + rc = LNetMDUnlink(*mdh); + LASSERT (rc == 0); + } else { + CDEBUG (D_NET, + "Posted active RDMA: peer %s, portal %u, matchbits "LPX64"\n", + libcfs_id2str(peer), portal, matchbits); + } + return 0; +} + +int +srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf, + int len, lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + return srpc_post_active_rdma(srpc_serv_portal(service), service, + buf, len, LNET_MD_OP_PUT, peer, + LNET_NID_ANY, mdh, ev); +} + +int +srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, + lnet_handle_md_t *mdh, srpc_event_t *ev) +{ + lnet_process_id_t any = {0}; + + any.nid = LNET_NID_ANY; + any.pid = LNET_PID_ANY; + + return srpc_post_passive_rdma(srpc_serv_portal(service), + local, service, buf, len, + LNET_MD_OP_PUT, any, mdh, ev); +} + +int +srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) +{ + struct srpc_service *sv = scd->scd_svc; + struct srpc_msg *msg = &buf->buf_msg; + int rc; + + LNetInvalidateHandle(&buf->buf_mdh); + list_add(&buf->buf_list, &scd->scd_buf_posted); + scd->scd_buf_nposted++; + spin_unlock(&scd->scd_lock); + + rc = srpc_post_passive_rqtbuf(sv->sv_id, + !srpc_serv_is_framework(sv), + msg, sizeof(*msg), &buf->buf_mdh, + &scd->scd_ev); + + /* At this point, a RPC (new or delayed) may have arrived in + * msg and its event handler has been called. So we must add + * buf to scd_buf_posted _before_ dropping scd_lock */ + + spin_lock(&scd->scd_lock); + + if (rc == 0) { + if (!sv->sv_shuttingdown) + return 0; + + spin_unlock(&scd->scd_lock); + /* srpc_shutdown_service might have tried to unlink me + * when my buf_mdh was still invalid */ + LNetMDUnlink(buf->buf_mdh); + spin_lock(&scd->scd_lock); + return 0; + } + + scd->scd_buf_nposted--; + if (sv->sv_shuttingdown) + return rc; /* don't allow to change scd_buf_posted */ + + list_del(&buf->buf_list); + spin_unlock(&scd->scd_lock); + + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + return rc; +} + +int +srpc_add_buffer(struct swi_workitem *wi) +{ + struct srpc_service_cd *scd = wi->swi_workitem.wi_data; + struct srpc_buffer *buf; + int rc = 0; + + /* it's called by workitem scheduler threads, these threads + * should have been set CPT affinity, so buffers will be posted + * on CPT local list of Portal */ + spin_lock(&scd->scd_lock); + + while (scd->scd_buf_adjust > 0 && + !scd->scd_svc->sv_shuttingdown) { + scd->scd_buf_adjust--; /* consume it */ + scd->scd_buf_posting++; + + spin_unlock(&scd->scd_lock); + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Failed to add new buf to service: %s\n", + scd->scd_svc->sv_name); + spin_lock(&scd->scd_lock); + rc = -ENOMEM; + break; + } + + spin_lock(&scd->scd_lock); + if (scd->scd_svc->sv_shuttingdown) { + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + + spin_lock(&scd->scd_lock); + rc = -ESHUTDOWN; + break; + } + + rc = srpc_service_post_buffer(scd, buf); + if (rc != 0) + break; /* buf has been freed inside */ + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + scd->scd_buf_total++; + scd->scd_buf_low = MAX(2, scd->scd_buf_total / 4); + } + + if (rc != 0) { + scd->scd_buf_err_stamp = cfs_time_current_sec(); + scd->scd_buf_err = rc; + + LASSERT(scd->scd_buf_posting > 0); + scd->scd_buf_posting--; + } + + spin_unlock(&scd->scd_lock); + return 0; +} + +int +srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int rc = 0; + int i; + + LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + scd->scd_buf_err = 0; + scd->scd_buf_err_stamp = 0; + scd->scd_buf_posting = 0; + scd->scd_buf_adjust = nbuffer; + /* start to post buffers */ + swi_schedule_workitem(&scd->scd_buf_wi); + spin_unlock(&scd->scd_lock); + + /* framework service only post buffer for one partition */ + if (srpc_serv_is_framework(sv)) + break; + } + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + /* + * NB: srpc_service_add_buffers() can be called inside + * thread context of lst_sched_serial, and we don't normally + * allow to sleep inside thread context of WI scheduler + * because it will block current scheduler thread from doing + * anything else, even worse, it could deadlock if it's + * waiting on result from another WI of the same scheduler. + * However, it's safe at here because scd_buf_wi is scheduled + * by thread in a different WI scheduler (lst_sched_test), + * so we don't have any risk of deadlock, though this could + * block all WIs pending on lst_sched_serial for a moment + * which is not good but not fatal. + */ + lst_wait_until(scd->scd_buf_err != 0 || + (scd->scd_buf_adjust == 0 && + scd->scd_buf_posting == 0), + scd->scd_lock, "waiting for adding buffer\n"); + + if (scd->scd_buf_err != 0 && rc == 0) + rc = scd->scd_buf_err; + + spin_unlock(&scd->scd_lock); + } + + return rc; +} + +void +srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) +{ + struct srpc_service_cd *scd; + int num; + int i; + + LASSERT(!sv->sv_shuttingdown); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + num = scd->scd_buf_total + scd->scd_buf_posting; + scd->scd_buf_adjust -= min(nbuffer, num); + + spin_unlock(&scd->scd_lock); + } +} + +/* returns 1 if sv has finished, otherwise 0 */ +int +srpc_finish_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + if (!swi_deschedule_workitem(&scd->scd_buf_wi)) + return 0; + + if (scd->scd_buf_nposted > 0) { + CDEBUG(D_NET, "waiting for %d posted buffers to unlink", + scd->scd_buf_nposted); + spin_unlock(&scd->scd_lock); + return 0; + } + + if (list_empty(&scd->scd_rpc_active)) { + spin_unlock(&scd->scd_lock); + continue; + } + + rpc = list_entry(scd->scd_rpc_active.next, + struct srpc_server_rpc, srpc_list); + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, " + "wi %s scheduled %d running %d, " + "ev fired %d type %d status %d lnet %d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), + rpc->srpc_wi.swi_workitem.wi_scheduled, + rpc->srpc_wi.swi_workitem.wi_running, + rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, + rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); + spin_unlock(&scd->scd_lock); + return 0; + } + + /* no lock needed from now on */ + srpc_service_fini(sv); + return 1; +} + +/* called with sv->sv_lock held */ +void +srpc_service_recycle_buffer(struct srpc_service_cd *scd, srpc_buffer_t *buf) +{ + if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { + if (srpc_service_post_buffer(scd, buf) != 0) { + CWARN("Failed to post %s buffer\n", + scd->scd_svc->sv_name); + } + return; + } + + /* service is shutting down, or we want to recycle some buffers */ + scd->scd_buf_total--; + + if (scd->scd_buf_adjust < 0) { + scd->scd_buf_adjust++; + if (scd->scd_buf_adjust < 0 && + scd->scd_buf_total == 0 && scd->scd_buf_posting == 0) { + CDEBUG(D_INFO, + "Try to recyle %d buffers but nothing left\n", + scd->scd_buf_adjust); + scd->scd_buf_adjust = 0; + } + } + + spin_unlock(&scd->scd_lock); + LIBCFS_FREE(buf, sizeof(*buf)); + spin_lock(&scd->scd_lock); +} + +void +srpc_abort_service(struct srpc_service *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + int i; + + CDEBUG(D_NET, "Aborting service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the abort, NB: + * racing with incoming RPCs; complete fix should make test + * RPCs carry session ID in its headers */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { + rpc->srpc_aborted = 1; + swi_schedule_workitem(&rpc->srpc_wi); + } + + spin_unlock(&scd->scd_lock); + } +} + +void +srpc_shutdown_service(srpc_service_t *sv) +{ + struct srpc_service_cd *scd; + struct srpc_server_rpc *rpc; + srpc_buffer_t *buf; + int i; + + CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", + sv->sv_id, sv->sv_name); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_lock(&scd->scd_lock); + + sv->sv_shuttingdown = 1; /* i.e. no new active RPC */ + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) + spin_unlock(&scd->scd_lock); + + cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + spin_lock(&scd->scd_lock); + + /* schedule in-flight RPCs to notice the shutdown */ + list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) + swi_schedule_workitem(&rpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + + /* OK to traverse scd_buf_posted without lock, since no one + * touches scd_buf_posted now */ + list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) + LNetMDUnlink(buf->buf_mdh); + } +} + +int +srpc_send_request (srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_reqstev; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REQUEST_SENT; + + rc = srpc_post_active_rqtbuf(rpc->crpc_dest, rpc->crpc_service, + &rpc->crpc_reqstmsg, sizeof(srpc_msg_t), + &rpc->crpc_reqstmdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +int +srpc_prepare_reply (srpc_client_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->crpc_replyev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; + int rc; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &rpc->crpc_replymsg, sizeof(srpc_msg_t), + LNET_MD_OP_PUT, rpc->crpc_dest, + &rpc->crpc_replymdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +int +srpc_prepare_bulk (srpc_client_rpc_t *rpc) +{ + srpc_bulk_t *bk = &rpc->crpc_bulk; + srpc_event_t *ev = &rpc->crpc_bulkev; + __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT (bk->bk_niov <= LNET_MAX_IOV); + + if (bk->bk_niov == 0) return 0; /* nothing to do */ + + opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_BULK_REQ_RCVD; + + *id = srpc_next_id(); + + rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->crpc_dest, &bk->bk_mdh, ev); + if (rc != 0) { + LASSERT (rc == -ENOMEM); + ev->ev_fired = 1; /* no more event expected */ + } + return rc; +} + +int +srpc_do_bulk (srpc_server_rpc_t *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + srpc_bulk_t *bk = rpc->srpc_bulk; + __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; + int rc; + int opt; + + LASSERT (bk != NULL); + + opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; + opt |= LNET_MD_KIOV; + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, + &bk->bk_iovs[0], bk->bk_niov, opt, + rpc->srpc_peer, rpc->srpc_self, + &bk->bk_mdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* only called from srpc_handle_rpc */ +void +srpc_server_rpc_done(srpc_server_rpc_t *rpc, int status) +{ + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_buffer_t *buffer; + + LASSERT (status != 0 || rpc->srpc_wi.swi_state == SWI_STATE_DONE); + + rpc->srpc_status = status; + + CDEBUG_LIMIT (status == 0 ? D_NET : D_NETERROR, + "Server RPC %p done: service %s, peer %s, status %s:%d\n", + rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), + swi_state2str(rpc->srpc_wi.swi_state), status); + + if (status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_dropped++; + spin_unlock(&srpc_data.rpc_glock); + } + + if (rpc->srpc_done != NULL) + (*rpc->srpc_done) (rpc); + LASSERT(rpc->srpc_bulk == NULL); + + spin_lock(&scd->scd_lock); + + if (rpc->srpc_reqstbuf != NULL) { + /* NB might drop sv_lock in srpc_service_recycle_buffer, but + * sv won't go away for scd_rpc_active must not be empty */ + srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); + rpc->srpc_reqstbuf = NULL; + } + + list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ + + /* + * No one can schedule me now since: + * - I'm not on scd_rpc_active. + * - all LNet events have been fired. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT(rpc->srpc_ev.ev_fired); + swi_exit_workitem(&rpc->srpc_wi); + + if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { + buffer = list_entry(scd->scd_buf_blocked.next, + srpc_buffer_t, buf_list); + list_del(&buffer->buf_list); + + srpc_init_server_rpc(rpc, scd, buffer); + list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); + swi_schedule_workitem(&rpc->srpc_wi); + } else { + list_add(&rpc->srpc_list, &scd->scd_rpc_free); + } + + spin_unlock(&scd->scd_lock); + return; +} + +/* handles an incoming RPC */ +int +srpc_handle_rpc(swi_workitem_t *wi) +{ + struct srpc_server_rpc *rpc = wi->swi_workitem.wi_data; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + srpc_event_t *ev = &rpc->srpc_ev; + int rc = 0; + + LASSERT(wi == &rpc->srpc_wi); + + spin_lock(&scd->scd_lock); + + if (sv->sv_shuttingdown || rpc->srpc_aborted) { + spin_unlock(&scd->scd_lock); + + if (rpc->srpc_bulk != NULL) + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + LNetMDUnlink(rpc->srpc_replymdh); + + if (ev->ev_fired) { /* no more event, OK to finish */ + srpc_server_rpc_done(rpc, -ESHUTDOWN); + return 1; + } + return 0; + } + + spin_unlock(&scd->scd_lock); + + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: { + srpc_msg_t *msg; + srpc_generic_reply_t *reply; + + msg = &rpc->srpc_reqstbuf->buf_msg; + reply = &rpc->srpc_replymsg.msg_body.reply; + + if (msg->msg_magic == 0) { + /* moaned already in srpc_lnet_ev_handler */ + srpc_server_rpc_done(rpc, EBADMSG); + return 1; + } + + srpc_unpack_msg_hdr(msg); + if (msg->msg_version != SRPC_MSG_VERSION) { + CWARN("Version mismatch: %u, %u expected, from %s\n", + msg->msg_version, SRPC_MSG_VERSION, + libcfs_id2str(rpc->srpc_peer)); + reply->status = EPROTO; + /* drop through and send reply */ + } else { + reply->status = 0; + rc = (*sv->sv_handler)(rpc); + LASSERT(reply->status == 0 || !rpc->srpc_bulk); + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_BULK_STARTED; + + if (rpc->srpc_bulk != NULL) { + rc = srpc_do_bulk(rpc); + if (rc == 0) + return 0; /* wait for bulk */ + + LASSERT (ev->ev_fired); + ev->ev_status = rc; + } + } + case SWI_STATE_BULK_STARTED: + LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); + + if (rpc->srpc_bulk != NULL) { + rc = ev->ev_status; + + if (sv->sv_bulk_ready != NULL) + rc = (*sv->sv_bulk_ready) (rpc, rc); + + if (rc != 0) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + } + + wi->swi_state = SWI_STATE_REPLY_SUBMITTED; + rc = srpc_send_reply(rpc); + if (rc == 0) + return 0; /* wait for reply */ + srpc_server_rpc_done(rpc, rc); + return 1; + + case SWI_STATE_REPLY_SUBMITTED: + if (!ev->ev_fired) { + CERROR("RPC %p: bulk %p, service %d\n", + rpc, rpc->srpc_bulk, sv->sv_id); + CERROR("Event: status %d, type %d, lnet %d\n", + ev->ev_status, ev->ev_type, ev->ev_lnet); + LASSERT (ev->ev_fired); + } + + wi->swi_state = SWI_STATE_DONE; + srpc_server_rpc_done(rpc, ev->ev_status); + return 1; + } + + return 0; +} + +void +srpc_client_rpc_expired (void *data) +{ + srpc_client_rpc_t *rpc = data; + + CWARN ("Client RPC expired: service %d, peer %s, timeout %d.\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + rpc->crpc_timeout); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_timeout = 0; + srpc_abort_rpc(rpc, -ETIMEDOUT); + + spin_unlock(&rpc->crpc_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_expired++; + spin_unlock(&srpc_data.rpc_glock); +} + +inline void +srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc) +{ + stt_timer_t *timer = &rpc->crpc_timer; + + if (rpc->crpc_timeout == 0) return; + + INIT_LIST_HEAD(&timer->stt_list); + timer->stt_data = rpc; + timer->stt_func = srpc_client_rpc_expired; + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + cfs_time_current_sec()); + stt_add_timer(timer); + return; +} + +/* + * Called with rpc->crpc_lock held. + * + * Upon exit the RPC expiry timer is not queued and the handler is not + * running on any CPU. */ +void +srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) +{ + /* timer not planted or already exploded */ + if (rpc->crpc_timeout == 0) + return; + + /* timer sucessfully defused */ + if (stt_del_timer(&rpc->crpc_timer)) + return; + + /* timer detonated, wait for it to explode */ + while (rpc->crpc_timeout != 0) { + spin_unlock(&rpc->crpc_lock); + + schedule(); + + spin_lock(&rpc->crpc_lock); + } +} + +void +srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) +{ + swi_workitem_t *wi = &rpc->crpc_wi; + + LASSERT(status != 0 || wi->swi_state == SWI_STATE_DONE); + + spin_lock(&rpc->crpc_lock); + + rpc->crpc_closed = 1; + if (rpc->crpc_status == 0) + rpc->crpc_status = status; + + srpc_del_client_rpc_timer(rpc); + + CDEBUG_LIMIT ((status == 0) ? D_NET : D_NETERROR, + "Client RPC done: service %d, peer %s, status %s:%d:%d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(wi->swi_state), rpc->crpc_aborted, status); + + /* + * No one can schedule me now since: + * - RPC timer has been defused. + * - all LNet events have been fired. + * - crpc_closed has been set, preventing srpc_abort_rpc from + * scheduling me. + * Cancel pending schedules and prevent future schedule attempts: + */ + LASSERT (!srpc_event_pending(rpc)); + swi_exit_workitem(wi); + + spin_unlock(&rpc->crpc_lock); + + (*rpc->crpc_done)(rpc); + return; +} + +/* sends an outgoing RPC */ +int +srpc_send_rpc (swi_workitem_t *wi) +{ + int rc = 0; + srpc_client_rpc_t *rpc; + srpc_msg_t *reply; + int do_bulk; + + LASSERT(wi != NULL); + + rpc = wi->swi_workitem.wi_data; + + LASSERT (rpc != NULL); + LASSERT (wi == &rpc->crpc_wi); + + reply = &rpc->crpc_replymsg; + do_bulk = rpc->crpc_bulk.bk_niov > 0; + + spin_lock(&rpc->crpc_lock); + + if (rpc->crpc_aborted) { + spin_unlock(&rpc->crpc_lock); + goto abort; + } + + spin_unlock(&rpc->crpc_lock); + + switch (wi->swi_state) { + default: + LBUG (); + case SWI_STATE_NEWBORN: + LASSERT (!srpc_event_pending(rpc)); + + rc = srpc_prepare_reply(rpc); + if (rc != 0) { + srpc_client_rpc_done(rpc, rc); + return 1; + } + + rc = srpc_prepare_bulk(rpc); + if (rc != 0) break; + + wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; + rc = srpc_send_request(rpc); + break; + + case SWI_STATE_REQUEST_SUBMITTED: + /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any + * order; however, they're processed in a strict order: + * rqt, rpy, and bulk. */ + if (!rpc->crpc_reqstev.ev_fired) break; + + rc = rpc->crpc_reqstev.ev_status; + if (rc != 0) break; + + wi->swi_state = SWI_STATE_REQUEST_SENT; + /* perhaps more events, fall thru */ + case SWI_STATE_REQUEST_SENT: { + srpc_msg_type_t type = srpc_service2reply(rpc->crpc_service); + + if (!rpc->crpc_replyev.ev_fired) break; + + rc = rpc->crpc_replyev.ev_status; + if (rc != 0) break; + + srpc_unpack_msg_hdr(reply); + if (reply->msg_type != type || + (reply->msg_magic != SRPC_MSG_MAGIC && + reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CWARN ("Bad message from %s: type %u (%d expected)," + " magic %u (%d expected).\n", + libcfs_id2str(rpc->crpc_dest), + reply->msg_type, type, + reply->msg_magic, SRPC_MSG_MAGIC); + rc = -EBADMSG; + break; + } + + if (do_bulk && reply->msg_body.reply.status != 0) { + CWARN ("Remote error %d at %s, unlink bulk buffer in " + "case peer didn't initiate bulk transfer\n", + reply->msg_body.reply.status, + libcfs_id2str(rpc->crpc_dest)); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + } + + wi->swi_state = SWI_STATE_REPLY_RECEIVED; + } + case SWI_STATE_REPLY_RECEIVED: + if (do_bulk && !rpc->crpc_bulkev.ev_fired) break; + + rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; + + /* Bulk buffer was unlinked due to remote error. Clear error + * since reply buffer still contains valid data. + * NB rpc->crpc_done shouldn't look into bulk data in case of + * remote error. */ + if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && + rpc->crpc_status == 0 && reply->msg_body.reply.status != 0) + rc = 0; + + wi->swi_state = SWI_STATE_DONE; + srpc_client_rpc_done(rpc, rc); + return 1; + } + + if (rc != 0) { + spin_lock(&rpc->crpc_lock); + srpc_abort_rpc(rpc, rc); + spin_unlock(&rpc->crpc_lock); + } + +abort: + if (rpc->crpc_aborted) { + LNetMDUnlink(rpc->crpc_reqstmdh); + LNetMDUnlink(rpc->crpc_replymdh); + LNetMDUnlink(rpc->crpc_bulk.bk_mdh); + + if (!srpc_event_pending(rpc)) { + srpc_client_rpc_done(rpc, -EINTR); + return 1; + } + } + return 0; +} + +srpc_client_rpc_t * +srpc_create_client_rpc (lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + srpc_client_rpc_t *rpc; + + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + if (rpc == NULL) + return NULL; + + srpc_init_client_rpc(rpc, peer, service, nbulkiov, + bulklen, rpc_done, rpc_fini, priv); + return rpc; +} + +/* called with rpc->crpc_lock held */ +void +srpc_abort_rpc (srpc_client_rpc_t *rpc, int why) +{ + LASSERT (why != 0); + + if (rpc->crpc_aborted || /* already aborted */ + rpc->crpc_closed) /* callback imminent */ + return; + + CDEBUG (D_NET, + "Aborting RPC: service %d, peer %s, state %s, why %d\n", + rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), + swi_state2str(rpc->crpc_wi.swi_state), why); + + rpc->crpc_aborted = 1; + rpc->crpc_status = why; + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + +/* called with rpc->crpc_lock held */ +void +srpc_post_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (!rpc->crpc_aborted); + LASSERT (srpc_data.rpc_state == SRPC_STATE_RUNNING); + + CDEBUG (D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", + libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, + rpc->crpc_timeout); + + srpc_add_client_rpc_timer(rpc); + swi_schedule_workitem(&rpc->crpc_wi); + return; +} + + +int +srpc_send_reply(struct srpc_server_rpc *rpc) +{ + srpc_event_t *ev = &rpc->srpc_ev; + struct srpc_msg *msg = &rpc->srpc_replymsg; + struct srpc_buffer *buffer = rpc->srpc_reqstbuf; + struct srpc_service_cd *scd = rpc->srpc_scd; + struct srpc_service *sv = scd->scd_svc; + __u64 rpyid; + int rc; + + LASSERT(buffer != NULL); + rpyid = buffer->buf_msg.msg_body.reqst.rpyid; + + spin_lock(&scd->scd_lock); + + if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { + /* Repost buffer before replying since test client + * might send me another RPC once it gets the reply */ + if (srpc_service_post_buffer(scd, buffer) != 0) + CWARN("Failed to repost %s buffer\n", sv->sv_name); + rpc->srpc_reqstbuf = NULL; + } + + spin_unlock(&scd->scd_lock); + + ev->ev_fired = 0; + ev->ev_data = rpc; + ev->ev_type = SRPC_REPLY_SENT; + + msg->msg_magic = SRPC_MSG_MAGIC; + msg->msg_version = SRPC_MSG_VERSION; + msg->msg_type = srpc_service2reply(sv->sv_id); + + rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, + sizeof(*msg), LNET_MD_OP_PUT, + rpc->srpc_peer, rpc->srpc_self, + &rpc->srpc_replymdh, ev); + if (rc != 0) + ev->ev_fired = 1; /* no more event expected */ + return rc; +} + +/* when in kernel always called with LNET_LOCK() held, and in thread context */ +void +srpc_lnet_ev_handler(lnet_event_t *ev) +{ + struct srpc_service_cd *scd; + srpc_event_t *rpcev = ev->md.user_ptr; + srpc_client_rpc_t *crpc; + srpc_server_rpc_t *srpc; + srpc_buffer_t *buffer; + srpc_service_t *sv; + srpc_msg_t *msg; + srpc_msg_type_t type; + + LASSERT (!in_interrupt()); + + if (ev->status != 0) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.errors++; + spin_unlock(&srpc_data.rpc_glock); + } + + rpcev->ev_lnet = ev->type; + + switch (rpcev->ev_type) { + default: + CERROR("Unknown event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + case SRPC_REQUEST_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_sent++; + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_RCVD: + case SRPC_BULK_REQ_RCVD: + crpc = rpcev->ev_data; + + if (rpcev != &crpc->crpc_reqstev && + rpcev != &crpc->crpc_replyev && + rpcev != &crpc->crpc_bulkev) { + CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", + rpcev, crpc, &crpc->crpc_reqstev, + &crpc->crpc_replyev, &crpc->crpc_bulkev); + CERROR("Bad event: status %d, type %d, lnet %d\n", + rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); + LBUG (); + } + + spin_lock(&crpc->crpc_lock); + + LASSERT(rpcev->ev_fired == 0); + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&crpc->crpc_wi); + + spin_unlock(&crpc->crpc_lock); + break; + + case SRPC_REQUEST_RCVD: + scd = rpcev->ev_data; + sv = scd->scd_svc; + + LASSERT(rpcev == &scd->scd_ev); + + spin_lock(&scd->scd_lock); + + LASSERT (ev->unlinked); + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->type != LNET_EVENT_UNLINK || + sv->sv_shuttingdown); + + buffer = container_of(ev->md.start, srpc_buffer_t, buf_msg); + buffer->buf_peer = ev->initiator; + buffer->buf_self = ev->target.nid; + + LASSERT(scd->scd_buf_nposted > 0); + scd->scd_buf_nposted--; + + if (sv->sv_shuttingdown) { + /* Leave buffer on scd->scd_buf_nposted since + * srpc_finish_service needs to traverse it. */ + spin_unlock(&scd->scd_lock); + break; + } + + if (scd->scd_buf_err_stamp != 0 && + scd->scd_buf_err_stamp < cfs_time_current_sec()) { + /* re-enable adding buffer */ + scd->scd_buf_err_stamp = 0; + scd->scd_buf_err = 0; + } + + if (scd->scd_buf_err == 0 && /* adding buffer is enabled */ + scd->scd_buf_adjust == 0 && + scd->scd_buf_nposted < scd->scd_buf_low) { + scd->scd_buf_adjust = MAX(scd->scd_buf_total / 2, + SFW_TEST_WI_MIN); + swi_schedule_workitem(&scd->scd_buf_wi); + } + + list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ + msg = &buffer->buf_msg; + type = srpc_service2request(sv->sv_id); + + if (ev->status != 0 || ev->mlength != sizeof(*msg) || + (msg->msg_type != type && + msg->msg_type != __swab32(type)) || + (msg->msg_magic != SRPC_MSG_MAGIC && + msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { + CERROR ("Dropping RPC (%s) from %s: " + "status %d mlength %d type %u magic %u.\n", + sv->sv_name, libcfs_id2str(ev->initiator), + ev->status, ev->mlength, + msg->msg_type, msg->msg_magic); + + /* NB can't call srpc_service_recycle_buffer here since + * it may call LNetM[DE]Attach. The invalid magic tells + * srpc_handle_rpc to drop this RPC */ + msg->msg_magic = 0; + } + + if (!list_empty(&scd->scd_rpc_free)) { + srpc = list_entry(scd->scd_rpc_free.next, + struct srpc_server_rpc, + srpc_list); + list_del(&srpc->srpc_list); + + srpc_init_server_rpc(srpc, scd, buffer); + list_add_tail(&srpc->srpc_list, + &scd->scd_rpc_active); + swi_schedule_workitem(&srpc->srpc_wi); + } else { + list_add_tail(&buffer->buf_list, + &scd->scd_buf_blocked); + } + + spin_unlock(&scd->scd_lock); + + spin_lock(&srpc_data.rpc_glock); + srpc_data.rpc_counters.rpcs_rcvd++; + spin_unlock(&srpc_data.rpc_glock); + break; + + case SRPC_BULK_GET_RPLD: + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_REPLY || + ev->type == LNET_EVENT_UNLINK); + + if (!ev->unlinked) + break; /* wait for final event */ + + case SRPC_BULK_PUT_SENT: + if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { + spin_lock(&srpc_data.rpc_glock); + + if (rpcev->ev_type == SRPC_BULK_GET_RPLD) + srpc_data.rpc_counters.bulk_get += ev->mlength; + else + srpc_data.rpc_counters.bulk_put += ev->mlength; + + spin_unlock(&srpc_data.rpc_glock); + } + case SRPC_REPLY_SENT: + srpc = rpcev->ev_data; + scd = srpc->srpc_scd; + + LASSERT(rpcev == &srpc->srpc_ev); + + spin_lock(&scd->scd_lock); + + rpcev->ev_fired = 1; + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + -EINTR : ev->status; + swi_schedule_workitem(&srpc->srpc_wi); + + spin_unlock(&scd->scd_lock); + break; + } +} + + +int +srpc_startup (void) +{ + int rc; + + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); + spin_lock_init(&srpc_data.rpc_glock); + + /* 1 second pause to avoid timestamp reuse */ + cfs_pause(cfs_time_seconds(1)); + srpc_data.rpc_matchbits = ((__u64) cfs_time_current_sec()) << 48; + + srpc_data.rpc_state = SRPC_STATE_NONE; + + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc < 0) { + CERROR ("LNetNIInit() has failed: %d\n", rc); + return rc; + } + + srpc_data.rpc_state = SRPC_STATE_NI_INIT; + + LNetInvalidateHandle(&srpc_data.rpc_lnet_eq); + rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); + if (rc != 0) { + CERROR("LNetEQAlloc() has failed: %d\n", rc); + goto bail; + } + + rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + LASSERT(rc == 0); + rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT(rc == 0); + + srpc_data.rpc_state = SRPC_STATE_EQ_INIT; + + rc = stt_startup(); + +bail: + if (rc != 0) + srpc_shutdown(); + else + srpc_data.rpc_state = SRPC_STATE_RUNNING; + + return rc; +} + +void +srpc_shutdown (void) +{ + int i; + int rc; + int state; + + state = srpc_data.rpc_state; + srpc_data.rpc_state = SRPC_STATE_STOPPING; + + switch (state) { + default: + LBUG (); + case SRPC_STATE_RUNNING: + spin_lock(&srpc_data.rpc_glock); + + for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { + srpc_service_t *sv = srpc_data.rpc_services[i]; + + LASSERTF (sv == NULL, + "service not empty: id %d, name %s\n", + i, sv->sv_name); + } + + spin_unlock(&srpc_data.rpc_glock); + + stt_shutdown(); + + case SRPC_STATE_EQ_INIT: + rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); + rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); + LASSERT (rc == 0); + rc = LNetEQFree(srpc_data.rpc_lnet_eq); + LASSERT (rc == 0); /* the EQ should have no user by now */ + + case SRPC_STATE_NI_INIT: + LNetNIFini(); + } + + return; +} diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h new file mode 100644 index 000000000000..b905d49a351f --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/rpc.h @@ -0,0 +1,302 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __SELFTEST_RPC_H__ +#define __SELFTEST_RPC_H__ + +#include + +/* + * LST wired structures + * + * XXX: *REPLY == *REQST + 1 + */ +typedef enum { + SRPC_MSG_MKSN_REQST = 0, + SRPC_MSG_MKSN_REPLY = 1, + SRPC_MSG_RMSN_REQST = 2, + SRPC_MSG_RMSN_REPLY = 3, + SRPC_MSG_BATCH_REQST = 4, + SRPC_MSG_BATCH_REPLY = 5, + SRPC_MSG_STAT_REQST = 6, + SRPC_MSG_STAT_REPLY = 7, + SRPC_MSG_TEST_REQST = 8, + SRPC_MSG_TEST_REPLY = 9, + SRPC_MSG_DEBUG_REQST = 10, + SRPC_MSG_DEBUG_REPLY = 11, + SRPC_MSG_BRW_REQST = 12, + SRPC_MSG_BRW_REPLY = 13, + SRPC_MSG_PING_REQST = 14, + SRPC_MSG_PING_REPLY = 15, + SRPC_MSG_JOIN_REQST = 16, + SRPC_MSG_JOIN_REPLY = 17, +} srpc_msg_type_t; + + +/* CAVEAT EMPTOR: + * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, + * and 2nd field matchbits of bulk buffer if any. + * + * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field + * session id if needed. + */ +typedef struct { + __u64 rpyid; /* reply buffer matchbits */ + __u64 bulkid; /* bulk buffer matchbits */ +} WIRE_ATTR srpc_generic_reqst_t; + +typedef struct { + __u32 status; + lst_sid_t sid; +} WIRE_ATTR srpc_generic_reply_t; + +/* FRAMEWORK RPCs */ +typedef struct { + __u64 mksn_rpyid; /* reply buffer matchbits */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_force; /* use brute force */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reqst_t; /* make session request */ + +typedef struct { + __u32 mksn_status; /* session status */ + lst_sid_t mksn_sid; /* session id */ + __u32 mksn_timeout; /* session timeout */ + char mksn_name[LST_NAME_SIZE]; +} WIRE_ATTR srpc_mksn_reply_t; /* make session reply */ + +typedef struct { + __u64 rmsn_rpyid; /* reply buffer matchbits */ + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reqst_t; /* remove session request */ + +typedef struct { + __u32 rmsn_status; + lst_sid_t rmsn_sid; /* session id */ +} WIRE_ATTR srpc_rmsn_reply_t; /* remove session reply */ + +typedef struct { + __u64 join_rpyid; /* reply buffer matchbits */ + lst_sid_t join_sid; /* session id to join */ + char join_group[LST_NAME_SIZE]; /* group name */ +} WIRE_ATTR srpc_join_reqst_t; + +typedef struct { + __u32 join_status; /* returned status */ + lst_sid_t join_sid; /* session id */ + __u32 join_timeout; /* # seconds' inactivity to expire */ + char join_session[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_join_reply_t; + +typedef struct { + __u64 dbg_rpyid; /* reply buffer matchbits */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_flags; /* bitmap of debug */ +} WIRE_ATTR srpc_debug_reqst_t; + +typedef struct { + __u32 dbg_status; /* returned code */ + lst_sid_t dbg_sid; /* session id */ + __u32 dbg_timeout; /* session timeout */ + __u32 dbg_nbatch; /* # of batches in the node */ + char dbg_name[LST_NAME_SIZE]; /* session name */ +} WIRE_ATTR srpc_debug_reply_t; + +#define SRPC_BATCH_OPC_RUN 1 +#define SRPC_BATCH_OPC_STOP 2 +#define SRPC_BATCH_OPC_QUERY 3 + +typedef struct { + __u64 bar_rpyid; /* reply buffer matchbits */ + lst_sid_t bar_sid; /* session id */ + lst_bid_t bar_bid; /* batch id */ + __u32 bar_opc; /* create/start/stop batch */ + __u32 bar_testidx; /* index of test */ + __u32 bar_arg; /* parameters */ +} WIRE_ATTR srpc_batch_reqst_t; + +typedef struct { + __u32 bar_status; /* status of request */ + lst_sid_t bar_sid; /* session id */ + __u32 bar_active; /* # of active tests in batch/test */ + __u32 bar_time; /* remained time */ +} WIRE_ATTR srpc_batch_reply_t; + +typedef struct { + __u64 str_rpyid; /* reply buffer matchbits */ + lst_sid_t str_sid; /* session id */ + __u32 str_type; /* type of stat */ +} WIRE_ATTR srpc_stat_reqst_t; + +typedef struct { + __u32 str_status; + lst_sid_t str_sid; + sfw_counters_t str_fw; + srpc_counters_t str_rpc; + lnet_counters_t str_lnet; +} WIRE_ATTR srpc_stat_reply_t; + +typedef struct { + __u32 blk_opc; /* bulk operation code */ + __u32 blk_npg; /* # of pages */ + __u32 blk_flags; /* reserved flags */ +} WIRE_ATTR test_bulk_req_t; + +typedef struct { + /** bulk operation code */ + __u16 blk_opc; + /** data check flags */ + __u16 blk_flags; + /** data length */ + __u32 blk_len; + /** reserved: offset */ + __u32 blk_offset; +} WIRE_ATTR test_bulk_req_v1_t; + +typedef struct { + __u32 png_size; /* size of ping message */ + __u32 png_flags; /* reserved flags */ +} WIRE_ATTR test_ping_req_t; + +typedef struct { + __u64 tsr_rpyid; /* reply buffer matchbits */ + __u64 tsr_bulkid; /* bulk buffer matchbits */ + lst_sid_t tsr_sid; /* session id */ + lst_bid_t tsr_bid; /* batch id */ + __u32 tsr_service; /* test type: bulk|ping|... */ + /* test client loop count or # server buffers needed */ + __u32 tsr_loop; + __u32 tsr_concur; /* concurrency of test */ + __u8 tsr_is_client; /* is test client or not */ + __u8 tsr_stop_onerr; /* stop on error */ + __u32 tsr_ndest; /* # of dest nodes */ + + union { + test_ping_req_t ping; + test_bulk_req_t bulk_v0; + test_bulk_req_v1_t bulk_v1; + } tsr_u; +} WIRE_ATTR srpc_test_reqst_t; + +typedef struct { + __u32 tsr_status; /* returned code */ + lst_sid_t tsr_sid; +} WIRE_ATTR srpc_test_reply_t; + +/* TEST RPCs */ +typedef struct { + __u64 pnr_rpyid; + __u32 pnr_magic; + __u32 pnr_seq; + __u64 pnr_time_sec; + __u64 pnr_time_usec; +} WIRE_ATTR srpc_ping_reqst_t; + +typedef struct { + __u32 pnr_status; + __u32 pnr_magic; + __u32 pnr_seq; +} WIRE_ATTR srpc_ping_reply_t; + +typedef struct { + __u64 brw_rpyid; /* reply buffer matchbits */ + __u64 brw_bulkid; /* bulk buffer matchbits */ + __u32 brw_rw; /* read or write */ + __u32 brw_len; /* bulk data len */ + __u32 brw_flags; /* bulk data patterns */ +} WIRE_ATTR srpc_brw_reqst_t; /* bulk r/w request */ + +typedef struct { + __u32 brw_status; +} WIRE_ATTR srpc_brw_reply_t; /* bulk r/w reply */ + +#define SRPC_MSG_MAGIC 0xeeb0f00d +#define SRPC_MSG_VERSION 1 + +typedef struct srpc_msg { + /** magic number */ + __u32 msg_magic; + /** message version number */ + __u32 msg_version; + /** type of message body: srpc_msg_type_t */ + __u32 msg_type; + __u32 msg_reserved0; + __u32 msg_reserved1; + /** test session features */ + __u32 msg_ses_feats; + union { + srpc_generic_reqst_t reqst; + srpc_generic_reply_t reply; + + srpc_mksn_reqst_t mksn_reqst; + srpc_mksn_reply_t mksn_reply; + srpc_rmsn_reqst_t rmsn_reqst; + srpc_rmsn_reply_t rmsn_reply; + srpc_debug_reqst_t dbg_reqst; + srpc_debug_reply_t dbg_reply; + srpc_batch_reqst_t bat_reqst; + srpc_batch_reply_t bat_reply; + srpc_stat_reqst_t stat_reqst; + srpc_stat_reply_t stat_reply; + srpc_test_reqst_t tes_reqst; + srpc_test_reply_t tes_reply; + srpc_join_reqst_t join_reqst; + srpc_join_reply_t join_reply; + + srpc_ping_reqst_t ping_reqst; + srpc_ping_reply_t ping_reply; + srpc_brw_reqst_t brw_reqst; + srpc_brw_reply_t brw_reply; + } msg_body; +} WIRE_ATTR srpc_msg_t; + +static inline void +srpc_unpack_msg_hdr(srpc_msg_t *msg) +{ + if (msg->msg_magic == SRPC_MSG_MAGIC) + return; /* no flipping needed */ + + /* We do not swap the magic number here as it is needed to + determine whether the body needs to be swapped. */ + /* __swab32s(&msg->msg_magic); */ + __swab32s(&msg->msg_type); + __swab32s(&msg->msg_version); + __swab32s(&msg->msg_ses_feats); + __swab32s(&msg->msg_reserved0); + __swab32s(&msg->msg_reserved1); +} + +#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h new file mode 100644 index 000000000000..8053b0563ff3 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/selftest.h @@ -0,0 +1,611 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/selftest.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_SELFTEST_H__ +#define __SELFTEST_SELFTEST_H__ + +#define LNET_ONLY + +#include +#include +#include +#include +#include + +#include "rpc.h" +#include "timer.h" + +#ifndef MADE_WITHOUT_COMPROMISE +#define MADE_WITHOUT_COMPROMISE +#endif + + +#define SWI_STATE_NEWBORN 0 +#define SWI_STATE_REPLY_SUBMITTED 1 +#define SWI_STATE_REPLY_SENT 2 +#define SWI_STATE_REQUEST_SUBMITTED 3 +#define SWI_STATE_REQUEST_SENT 4 +#define SWI_STATE_REPLY_RECEIVED 5 +#define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_DONE 10 + +/* forward refs */ +struct srpc_service; +struct srpc_service_cd; +struct sfw_test_unit; +struct sfw_test_instance; + +/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework + * services, e.g. create/modify session. + */ +#define SRPC_SERVICE_DEBUG 0 +#define SRPC_SERVICE_MAKE_SESSION 1 +#define SRPC_SERVICE_REMOVE_SESSION 2 +#define SRPC_SERVICE_BATCH 3 +#define SRPC_SERVICE_TEST 4 +#define SRPC_SERVICE_QUERY_STAT 5 +#define SRPC_SERVICE_JOIN 6 +#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 +/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ +#define SRPC_SERVICE_BRW 11 +#define SRPC_SERVICE_PING 12 +#define SRPC_SERVICE_MAX_ID 12 + +#define SRPC_REQUEST_PORTAL 50 +/* a lazy portal for framework RPC requests */ +#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 +/* all reply/bulk RDMAs go to this portal */ +#define SRPC_RDMA_PORTAL 52 + +static inline srpc_msg_type_t +srpc_service2request (int service) +{ + switch (service) { + default: + LBUG (); + case SRPC_SERVICE_DEBUG: + return SRPC_MSG_DEBUG_REQST; + + case SRPC_SERVICE_MAKE_SESSION: + return SRPC_MSG_MKSN_REQST; + + case SRPC_SERVICE_REMOVE_SESSION: + return SRPC_MSG_RMSN_REQST; + + case SRPC_SERVICE_BATCH: + return SRPC_MSG_BATCH_REQST; + + case SRPC_SERVICE_TEST: + return SRPC_MSG_TEST_REQST; + + case SRPC_SERVICE_QUERY_STAT: + return SRPC_MSG_STAT_REQST; + + case SRPC_SERVICE_BRW: + return SRPC_MSG_BRW_REQST; + + case SRPC_SERVICE_PING: + return SRPC_MSG_PING_REQST; + + case SRPC_SERVICE_JOIN: + return SRPC_MSG_JOIN_REQST; + } +} + +static inline srpc_msg_type_t +srpc_service2reply (int service) +{ + return srpc_service2request(service) + 1; +} + +typedef enum { + SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) received */ + SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ + SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ + SRPC_REPLY_RCVD = 4, /* incoming reply received */ + SRPC_REPLY_SENT = 5, /* outgoing reply sent */ + SRPC_REQUEST_RCVD = 6, /* incoming request received */ + SRPC_REQUEST_SENT = 7, /* outgoing request sent */ +} srpc_event_type_t; + +/* RPC event */ +typedef struct { + srpc_event_type_t ev_type; /* what's up */ + lnet_event_kind_t ev_lnet; /* LNet event type */ + int ev_fired; /* LNet event fired? */ + int ev_status; /* LNet event status */ + void *ev_data; /* owning server/client RPC */ +} srpc_event_t; + +typedef struct { + int bk_len; /* len of bulk data */ + lnet_handle_md_t bk_mdh; + int bk_sink; /* sink/source */ + int bk_niov; /* # iov in bk_iovs */ + lnet_kiov_t bk_iovs[0]; +} srpc_bulk_t; /* bulk descriptor */ + +/* message buffer descriptor */ +typedef struct srpc_buffer { + struct list_head buf_list; /* chain on srpc_service::*_msgq */ + srpc_msg_t buf_msg; + lnet_handle_md_t buf_mdh; + lnet_nid_t buf_self; + lnet_process_id_t buf_peer; +} srpc_buffer_t; + +struct swi_workitem; +typedef int (*swi_action_t) (struct swi_workitem *); + +typedef struct swi_workitem { + struct cfs_wi_sched *swi_sched; + cfs_workitem_t swi_workitem; + swi_action_t swi_action; + int swi_state; +} swi_workitem_t; + +/* server-side state of a RPC */ +typedef struct srpc_server_rpc { + /* chain on srpc_service::*_rpcq */ + struct list_head srpc_list; + struct srpc_service_cd *srpc_scd; + swi_workitem_t srpc_wi; + srpc_event_t srpc_ev; /* bulk/reply event */ + lnet_nid_t srpc_self; + lnet_process_id_t srpc_peer; + srpc_msg_t srpc_replymsg; + lnet_handle_md_t srpc_replymdh; + srpc_buffer_t *srpc_reqstbuf; + srpc_bulk_t *srpc_bulk; + + unsigned int srpc_aborted; /* being given up */ + int srpc_status; + void (*srpc_done)(struct srpc_server_rpc *); +} srpc_server_rpc_t; + +/* client-side state of a RPC */ +typedef struct srpc_client_rpc { + struct list_head crpc_list; /* chain on user's lists */ + spinlock_t crpc_lock; /* serialize */ + int crpc_service; + atomic_t crpc_refcount; + int crpc_timeout; /* # seconds to wait for reply */ + stt_timer_t crpc_timer; + swi_workitem_t crpc_wi; + lnet_process_id_t crpc_dest; + + void (*crpc_done)(struct srpc_client_rpc *); + void (*crpc_fini)(struct srpc_client_rpc *); + int crpc_status; /* completion status */ + void *crpc_priv; /* caller data */ + + /* state flags */ + unsigned int crpc_aborted:1; /* being given up */ + unsigned int crpc_closed:1; /* completed */ + + /* RPC events */ + srpc_event_t crpc_bulkev; /* bulk event */ + srpc_event_t crpc_reqstev; /* request event */ + srpc_event_t crpc_replyev; /* reply event */ + + /* bulk, request(reqst), and reply exchanged on wire */ + srpc_msg_t crpc_reqstmsg; + srpc_msg_t crpc_replymsg; + lnet_handle_md_t crpc_reqstmdh; + lnet_handle_md_t crpc_replymdh; + srpc_bulk_t crpc_bulk; +} srpc_client_rpc_t; + +#define srpc_client_rpc_size(rpc) \ +offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) + +#define srpc_client_rpc_addref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + atomic_inc(&(rpc)->crpc_refcount); \ +} while (0) + +#define srpc_client_rpc_decref(rpc) \ +do { \ + CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ + (rpc), libcfs_id2str((rpc)->crpc_dest), \ + atomic_read(&(rpc)->crpc_refcount)); \ + LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ + if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ + srpc_destroy_client_rpc(rpc); \ +} while (0) + +#define srpc_event_pending(rpc) ((rpc)->crpc_bulkev.ev_fired == 0 || \ + (rpc)->crpc_reqstev.ev_fired == 0 || \ + (rpc)->crpc_replyev.ev_fired == 0) + +/* CPU partition data of srpc service */ +struct srpc_service_cd { + /** serialize */ + spinlock_t scd_lock; + /** backref to service */ + struct srpc_service *scd_svc; + /** event buffer */ + srpc_event_t scd_ev; + /** free RPC descriptors */ + struct list_head scd_rpc_free; + /** in-flight RPCs */ + struct list_head scd_rpc_active; + /** workitem for posting buffer */ + swi_workitem_t scd_buf_wi; + /** CPT id */ + int scd_cpt; + /** error code for scd_buf_wi */ + int scd_buf_err; + /** timestamp for scd_buf_err */ + unsigned long scd_buf_err_stamp; + /** total # request buffers */ + int scd_buf_total; + /** # posted request buffers */ + int scd_buf_nposted; + /** in progress of buffer posting */ + int scd_buf_posting; + /** allocate more buffers if scd_buf_nposted < scd_buf_low */ + int scd_buf_low; + /** increase/decrease some buffers */ + int scd_buf_adjust; + /** posted message buffers */ + struct list_head scd_buf_posted; + /** blocked for RPC descriptor */ + struct list_head scd_buf_blocked; +}; + +/* number of server workitems (mini-thread) for testing service */ +#define SFW_TEST_WI_MIN 256 +#define SFW_TEST_WI_MAX 2048 +/* extra buffers for tolerating buggy peers, or unbalanced number + * of peers between partitions */ +#define SFW_TEST_WI_EXTRA 64 + +/* number of server workitems (mini-thread) for framework service */ +#define SFW_FRWK_WI_MIN 16 +#define SFW_FRWK_WI_MAX 256 + +typedef struct srpc_service { + int sv_id; /* service id */ + const char *sv_name; /* human readable name */ + int sv_wi_total; /* total server workitems */ + int sv_shuttingdown; + int sv_ncpts; + /* percpt data for srpc_service */ + struct srpc_service_cd **sv_cpt_data; + /* Service callbacks: + * - sv_handler: process incoming RPC request + * - sv_bulk_ready: notify bulk data + */ + int (*sv_handler) (srpc_server_rpc_t *); + int (*sv_bulk_ready) (srpc_server_rpc_t *, int); +} srpc_service_t; + +typedef struct { + struct list_head sn_list; /* chain on fw_zombie_sessions */ + lst_sid_t sn_id; /* unique identifier */ + unsigned int sn_timeout; /* # seconds' inactivity to expire */ + int sn_timer_active; + unsigned int sn_features; + stt_timer_t sn_timer; + struct list_head sn_batches; /* list of batches */ + char sn_name[LST_NAME_SIZE]; + atomic_t sn_refcount; + atomic_t sn_brw_errors; + atomic_t sn_ping_errors; + cfs_time_t sn_started; +} sfw_session_t; + +#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ + (sid0).ses_stamp == (sid1).ses_stamp) + +typedef struct { + struct list_head bat_list; /* chain on sn_batches */ + lst_bid_t bat_id; /* batch id */ + int bat_error; /* error code of batch */ + sfw_session_t *bat_session; /* batch's session */ + atomic_t bat_nactive; /* # of active tests */ + struct list_head bat_tests; /* test instances */ +} sfw_batch_t; + +typedef struct { + int (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */ + void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + lnet_process_id_t dest, + srpc_client_rpc_t **rpc); /* prep a tests rpc */ + void (*tso_done_rpc)(struct sfw_test_unit *tsu, + srpc_client_rpc_t *rpc); /* done a test rpc */ +} sfw_test_client_ops_t; + +typedef struct sfw_test_instance { + struct list_head tsi_list; /* chain on batch */ + int tsi_service; /* test type */ + sfw_batch_t *tsi_batch; /* batch */ + sfw_test_client_ops_t *tsi_ops; /* test client operations */ + + /* public parameter for all test units */ + unsigned int tsi_is_client:1; /* is test client */ + unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ + int tsi_concur; /* concurrency */ + int tsi_loop; /* loop count */ + + /* status of test instance */ + spinlock_t tsi_lock; /* serialize */ + unsigned int tsi_stopping:1; /* test is stopping */ + atomic_t tsi_nactive; /* # of active test unit */ + struct list_head tsi_units; /* test units */ + struct list_head tsi_free_rpcs; /* free rpcs */ + struct list_head tsi_active_rpcs; /* active rpcs */ + + union { + test_ping_req_t ping; /* ping parameter */ + test_bulk_req_t bulk_v0; /* bulk parameter */ + test_bulk_req_v1_t bulk_v1; /* bulk v1 parameter */ + } tsi_u; +} sfw_test_instance_t; + +/* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at + * the end of pages are not used */ +#define SFW_MAX_CONCUR LST_MAX_CONCUR +#define SFW_ID_PER_PAGE (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t)) +#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) +#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) + +typedef struct sfw_test_unit { + struct list_head tsu_list; /* chain on lst_test_instance */ + lnet_process_id_t tsu_dest; /* id of dest node */ + int tsu_loop; /* loop count of the test */ + sfw_test_instance_t *tsu_instance; /* pointer to test instance */ + void *tsu_private; /* private data */ + swi_workitem_t tsu_worker; /* workitem of the test unit */ +} sfw_test_unit_t; + +typedef struct sfw_test_case { + struct list_head tsc_list; /* chain on fw_tests */ + srpc_service_t *tsc_srv_service; /* test service */ + sfw_test_client_ops_t *tsc_cli_ops; /* ops of test client */ +} sfw_test_case_t; + +srpc_client_rpc_t * +sfw_create_rpc(lnet_process_id_t peer, int service, + unsigned features, int nbulkiov, int bulklen, + void (*done) (srpc_client_rpc_t *), void *priv); +int sfw_create_test_rpc(sfw_test_unit_t *tsu, + lnet_process_id_t peer, unsigned features, + int nblk, int blklen, srpc_client_rpc_t **rpc); +void sfw_abort_rpc(srpc_client_rpc_t *rpc); +void sfw_post_rpc(srpc_client_rpc_t *rpc); +void sfw_client_rpc_done(srpc_client_rpc_t *rpc); +void sfw_unpack_message(srpc_msg_t *msg); +void sfw_free_pages(srpc_server_rpc_t *rpc); +void sfw_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i); +int sfw_alloc_pages(srpc_server_rpc_t *rpc, int cpt, int npages, int len, + int sink); +int sfw_make_session (srpc_mksn_reqst_t *request, srpc_mksn_reply_t *reply); + +srpc_client_rpc_t * +srpc_create_client_rpc(lnet_process_id_t peer, int service, + int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv); +void srpc_post_rpc(srpc_client_rpc_t *rpc); +void srpc_abort_rpc(srpc_client_rpc_t *rpc, int why); +void srpc_free_bulk(srpc_bulk_t *bk); +srpc_bulk_t *srpc_alloc_bulk(int cpt, unsigned bulk_npg, unsigned bulk_len, + int sink); +int srpc_send_rpc(swi_workitem_t *wi); +int srpc_send_reply(srpc_server_rpc_t *rpc); +int srpc_add_service(srpc_service_t *sv); +int srpc_remove_service(srpc_service_t *sv); +void srpc_shutdown_service(srpc_service_t *sv); +void srpc_abort_service(srpc_service_t *sv); +int srpc_finish_service(srpc_service_t *sv); +int srpc_service_add_buffers(srpc_service_t *sv, int nbuffer); +void srpc_service_remove_buffers(srpc_service_t *sv, int nbuffer); +void srpc_get_counters(srpc_counters_t *cnt); +void srpc_set_counters(const srpc_counters_t *cnt); + +extern struct cfs_wi_sched *lst_sched_serial; +extern struct cfs_wi_sched **lst_sched_test; + +static inline int +srpc_serv_is_framework(struct srpc_service *svc) +{ + return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; +} + +static inline int +swi_wi_action(cfs_workitem_t *wi) +{ + swi_workitem_t *swi = container_of(wi, swi_workitem_t, swi_workitem); + + return swi->swi_action(swi); +} + +static inline void +swi_init_workitem(swi_workitem_t *swi, void *data, + swi_action_t action, struct cfs_wi_sched *sched) +{ + swi->swi_sched = sched; + swi->swi_action = action; + swi->swi_state = SWI_STATE_NEWBORN; + cfs_wi_init(&swi->swi_workitem, data, swi_wi_action); +} + +static inline void +swi_schedule_workitem(swi_workitem_t *wi) +{ + cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); +} + +static inline void +swi_exit_workitem(swi_workitem_t *swi) +{ + cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); +} + +static inline int +swi_deschedule_workitem(swi_workitem_t *swi) +{ + return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); +} + + +int sfw_startup(void); +int srpc_startup(void); +void sfw_shutdown(void); +void srpc_shutdown(void); + +static inline void +srpc_destroy_client_rpc (srpc_client_rpc_t *rpc) +{ + LASSERT (rpc != NULL); + LASSERT (!srpc_event_pending(rpc)); + LASSERT (atomic_read(&rpc->crpc_refcount) == 0); + + if (rpc->crpc_fini == NULL) { + LIBCFS_FREE(rpc, srpc_client_rpc_size(rpc)); + } else { + (*rpc->crpc_fini) (rpc); + } + + return; +} + +static inline void +srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer, + int service, int nbulkiov, int bulklen, + void (*rpc_done)(srpc_client_rpc_t *), + void (*rpc_fini)(srpc_client_rpc_t *), void *priv) +{ + LASSERT (nbulkiov <= LNET_MAX_IOV); + + memset(rpc, 0, offsetof(srpc_client_rpc_t, + crpc_bulk.bk_iovs[nbulkiov])); + + INIT_LIST_HEAD(&rpc->crpc_list); + swi_init_workitem(&rpc->crpc_wi, rpc, srpc_send_rpc, + lst_sched_test[lnet_cpt_of_nid(peer.nid)]); + spin_lock_init(&rpc->crpc_lock); + atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ + + rpc->crpc_dest = peer; + rpc->crpc_priv = priv; + rpc->crpc_service = service; + rpc->crpc_bulk.bk_len = bulklen; + rpc->crpc_bulk.bk_niov = nbulkiov; + rpc->crpc_done = rpc_done; + rpc->crpc_fini = rpc_fini; + LNetInvalidateHandle(&rpc->crpc_reqstmdh); + LNetInvalidateHandle(&rpc->crpc_replymdh); + LNetInvalidateHandle(&rpc->crpc_bulk.bk_mdh); + + /* no event is expected at this point */ + rpc->crpc_bulkev.ev_fired = + rpc->crpc_reqstev.ev_fired = + rpc->crpc_replyev.ev_fired = 1; + + rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; + rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; + rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); + return; +} + +static inline const char * +swi_state2str (int state) +{ +#define STATE2STR(x) case x: return #x + switch(state) { + default: + LBUG(); + STATE2STR(SWI_STATE_NEWBORN); + STATE2STR(SWI_STATE_REPLY_SUBMITTED); + STATE2STR(SWI_STATE_REPLY_SENT); + STATE2STR(SWI_STATE_REQUEST_SUBMITTED); + STATE2STR(SWI_STATE_REQUEST_SENT); + STATE2STR(SWI_STATE_REPLY_RECEIVED); + STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_DONE); + } +#undef STATE2STR +} + +#define UNUSED(x) ( (void)(x) ) + + +#define selftest_wait_events() cfs_pause(cfs_time_seconds(1) / 10) + + +#define lst_wait_until(cond, lock, fmt, ...) \ +do { \ + int __I = 2; \ + while (!(cond)) { \ + CDEBUG(IS_PO2(++__I) ? D_WARNING : D_NET, \ + fmt, ## __VA_ARGS__); \ + spin_unlock(&(lock)); \ + \ + selftest_wait_events(); \ + \ + spin_lock(&(lock)); \ + } \ +} while (0) + +static inline void +srpc_wait_service_shutdown(srpc_service_t *sv) +{ + int i = 2; + + LASSERT(sv->sv_shuttingdown); + + while (srpc_finish_service(sv) == 0) { + i++; + CDEBUG (((i & -i) == i) ? D_WARNING : D_NET, + "Waiting for %s service to shutdown...\n", + sv->sv_name); + selftest_wait_events(); + } +} + +#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c new file mode 100644 index 000000000000..2c078550277b --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/timer.c @@ -0,0 +1,253 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.c + * + * Author: Isaac Huang + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include "selftest.h" + + +/* + * Timers are implemented as a sorted queue of expiry times. The queue + * is slotted, with each slot holding timers which expire in a + * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are + * sorted by increasing expiry time. The number of slots is 2**7 (128), + * to cover a time period of 1024 seconds into the future before wrapping. + */ +#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ +#define STTIMER_SLOTTIME (1 << STTIMER_MINPOLL) +#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) +#define STTIMER_NSLOTS (1 << 7) +#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ + (STTIMER_NSLOTS - 1))]) + +struct st_timer_data { + spinlock_t stt_lock; + /* start time of the slot processed previously */ + cfs_time_t stt_prev_slot; + struct list_head stt_hash[STTIMER_NSLOTS]; + int stt_shuttingdown; + wait_queue_head_t stt_waitq; + int stt_nthreads; +} stt_data; + +void +stt_add_timer(stt_timer_t *timer) +{ + struct list_head *pos; + + spin_lock(&stt_data.stt_lock); + + LASSERT (stt_data.stt_nthreads > 0); + LASSERT (!stt_data.stt_shuttingdown); + LASSERT (timer->stt_func != NULL); + LASSERT (list_empty(&timer->stt_list)); + LASSERT (cfs_time_after(timer->stt_expires, cfs_time_current_sec())); + + /* a simple insertion sort */ + list_for_each_prev (pos, STTIMER_SLOT(timer->stt_expires)) { + stt_timer_t *old = list_entry(pos, stt_timer_t, stt_list); + + if (cfs_time_aftereq(timer->stt_expires, old->stt_expires)) + break; + } + list_add(&timer->stt_list, pos); + + spin_unlock(&stt_data.stt_lock); +} + +/* + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + * + * CAVEAT EMPTOR: + * When 0 is returned, it is possible that timer->stt_func _is_ running on + * another CPU. + */ +int +stt_del_timer (stt_timer_t *timer) +{ + int ret = 0; + + spin_lock(&stt_data.stt_lock); + + LASSERT (stt_data.stt_nthreads > 0); + LASSERT (!stt_data.stt_shuttingdown); + + if (!list_empty(&timer->stt_list)) { + ret = 1; + list_del_init(&timer->stt_list); + } + + spin_unlock(&stt_data.stt_lock); + return ret; +} + +/* called with stt_data.stt_lock held */ +int +stt_expire_list (struct list_head *slot, cfs_time_t now) +{ + int expired = 0; + stt_timer_t *timer; + + while (!list_empty(slot)) { + timer = list_entry(slot->next, stt_timer_t, stt_list); + + if (cfs_time_after(timer->stt_expires, now)) + break; + + list_del_init(&timer->stt_list); + spin_unlock(&stt_data.stt_lock); + + expired++; + (*timer->stt_func) (timer->stt_data); + + spin_lock(&stt_data.stt_lock); + } + + return expired; +} + +int +stt_check_timers (cfs_time_t *last) +{ + int expired = 0; + cfs_time_t now; + cfs_time_t this_slot; + + now = cfs_time_current_sec(); + this_slot = now & STTIMER_SLOTTIMEMASK; + + spin_lock(&stt_data.stt_lock); + + while (cfs_time_aftereq(this_slot, *last)) { + expired += stt_expire_list(STTIMER_SLOT(this_slot), now); + this_slot = cfs_time_sub(this_slot, STTIMER_SLOTTIME); + } + + *last = now & STTIMER_SLOTTIMEMASK; + spin_unlock(&stt_data.stt_lock); + return expired; +} + + +int +stt_timer_main (void *arg) +{ + int rc = 0; + UNUSED(arg); + + SET_BUT_UNUSED(rc); + + cfs_block_allsigs(); + + while (!stt_data.stt_shuttingdown) { + stt_check_timers(&stt_data.stt_prev_slot); + + rc = wait_event_timeout(stt_data.stt_waitq, + stt_data.stt_shuttingdown, + cfs_time_seconds(STTIMER_SLOTTIME)); + } + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads--; + spin_unlock(&stt_data.stt_lock); + return 0; +} + +int +stt_start_timer_thread (void) +{ + task_t *task; + + LASSERT(!stt_data.stt_shuttingdown); + + task = kthread_run(stt_timer_main, NULL, "st_timer"); + if (IS_ERR(task)) + return PTR_ERR(task); + + spin_lock(&stt_data.stt_lock); + stt_data.stt_nthreads++; + spin_unlock(&stt_data.stt_lock); + return 0; +} + + +int +stt_startup (void) +{ + int rc = 0; + int i; + + stt_data.stt_shuttingdown = 0; + stt_data.stt_prev_slot = cfs_time_current_sec() & STTIMER_SLOTTIMEMASK; + + spin_lock_init(&stt_data.stt_lock); + for (i = 0; i < STTIMER_NSLOTS; i++) + INIT_LIST_HEAD(&stt_data.stt_hash[i]); + + stt_data.stt_nthreads = 0; + init_waitqueue_head(&stt_data.stt_waitq); + rc = stt_start_timer_thread(); + if (rc != 0) + CERROR ("Can't spawn timer thread: %d\n", rc); + + return rc; +} + +void +stt_shutdown (void) +{ + int i; + + spin_lock(&stt_data.stt_lock); + + for (i = 0; i < STTIMER_NSLOTS; i++) + LASSERT (list_empty(&stt_data.stt_hash[i])); + + stt_data.stt_shuttingdown = 1; + + wake_up(&stt_data.stt_waitq); + lst_wait_until(stt_data.stt_nthreads == 0, stt_data.stt_lock, + "waiting for %d threads to terminate\n", + stt_data.stt_nthreads); + + spin_unlock(&stt_data.stt_lock); +} diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h new file mode 100644 index 000000000000..56dbfe5ea1e5 --- /dev/null +++ b/drivers/staging/lustre/lnet/selftest/timer.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/selftest/timer.h + * + * Author: Isaac Huang + */ +#ifndef __SELFTEST_TIMER_H__ +#define __SELFTEST_TIMER_H__ + +typedef struct { + struct list_head stt_list; + cfs_time_t stt_expires; + void (*stt_func) (void *); + void *stt_data; +} stt_timer_t; + +void stt_add_timer (stt_timer_t *timer); +int stt_del_timer (stt_timer_t *timer); +int stt_startup (void); +void stt_shutdown (void); + +#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig new file mode 100644 index 000000000000..d0a0e08afbc7 --- /dev/null +++ b/drivers/staging/lustre/lustre/Kconfig @@ -0,0 +1,33 @@ +config LUSTRE_FS + tristate "Lustre file system client support" + depends on STAGING && INET && BROKEN + select LNET + help + This option enables Lustre file system client support. Choose Y + here if you want to access a Lustre file system cluster. To compile + this file system support as a module, choose M here: the module will + be called lustre. + + To mount Lustre file systems , you also need to install the user space + mount.lustre and other user space commands which can be found in the + lustre-client package, available from + http://downloads.whamcloud.com/public/lustre/ + + Lustre file system is the most popular cluster file system in high + performance computing. Source code of both kernel space and user space + Lustre components can also be found at + http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary + + If unsure, say N. + + See also http://wiki.lustre.org/ + +config LUSTRE_OBD_MAX_IOCTL_BUFFER + int "Lustre obd max ioctl buffer bytes (default 8KB)" + depends on LUSTRE_FS + default 8192 + help + This option defines the maximum size of buffer in bytes that user space + applications can pass to Lustre kernel module through ioctl interface. + + If unsure, use default. diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile new file mode 100644 index 000000000000..3fb94fc12068 --- /dev/null +++ b/drivers/staging/lustre/lustre/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_LUSTRE_FS) := fid/ lvfs/ obdclass/ ptlrpc/ obdecho/ mgc/ lov/ \ + osc/ mdc/ lmv/ llite/ fld/ libcfs/ diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile new file mode 100644 index 000000000000..b8d6d21b39ff --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += fid.o +fid-y := fid_handler.o fid_store.o fid_request.o lproc_fid.o fid_lib.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/fid/fid_handler.c b/drivers/staging/lustre/lustre/fid/fid_handler.c new file mode 100644 index 000000000000..bbbb3cfe57b3 --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/fid_handler.c @@ -0,0 +1,661 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_handler.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +# include +# include + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +int client_fid_init(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type) +{ + struct client_obd *cli = &obd->u.cli; + char *prefix; + int rc; + ENTRY; + + OBD_ALLOC_PTR(cli->cl_seq); + if (cli->cl_seq == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, type, prefix, NULL); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + GOTO(out_free_seq, rc); + + RETURN(rc); +out_free_seq: + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + return rc; +} +EXPORT_SYMBOL(client_fid_init); + +int client_fid_fini(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_seq != NULL) { + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(client_fid_fini); + +static void seq_server_proc_fini(struct lu_server_seq *seq); + +/* Assigns client to sequence controller node. */ +int seq_server_set_cli(struct lu_server_seq *seq, + struct lu_client_seq *cli, + const struct lu_env *env) +{ + int rc = 0; + ENTRY; + + /* + * Ask client for new range, assign that range to ->seq_space and write + * seq state to backing store should be atomic. + */ + mutex_lock(&seq->lss_mutex); + + if (cli == NULL) { + CDEBUG(D_INFO, "%s: Detached sequence client %s\n", + seq->lss_name, cli->lcs_name); + seq->lss_cli = cli; + GOTO(out_up, rc = 0); + } + + if (seq->lss_cli != NULL) { + CDEBUG(D_HA, "%s: Sequence controller is already " + "assigned\n", seq->lss_name); + GOTO(out_up, rc = -EEXIST); + } + + CDEBUG(D_INFO, "%s: Attached sequence controller %s\n", + seq->lss_name, cli->lcs_name); + + seq->lss_cli = cli; + cli->lcs_space.lsr_index = seq->lss_site->ss_node_id; + EXIT; +out_up: + mutex_unlock(&seq->lss_mutex); + return rc; +} +EXPORT_SYMBOL(seq_server_set_cli); +/* + * allocate \a w units of sequence from range \a from. + */ +static inline void range_alloc(struct lu_seq_range *to, + struct lu_seq_range *from, + __u64 width) +{ + width = min(range_space(from), width); + to->lsr_start = from->lsr_start; + to->lsr_end = from->lsr_start + width; + from->lsr_start += width; +} + +/** + * On controller node, allocate new super sequence for regular sequence server. + * As this super sequence controller, this node suppose to maintain fld + * and update index. + * \a out range always has currect mds node number of requester. + */ + +static int __seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + ENTRY; + + LASSERT(range_is_sane(space)); + + if (range_is_exhausted(space)) { + CERROR("%s: Sequences space is exhausted\n", + seq->lss_name); + RETURN(-ENOSPC); + } else { + range_alloc(out, space, seq->lss_width); + } + + rc = seq_store_update(env, seq, out, 1 /* sync */); + + LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n", + seq->lss_name, rc, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_super(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +static int __seq_set_init(const struct lu_env *env, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc; + + range_alloc(&seq->lss_lowater_set, space, seq->lss_set_width); + range_alloc(&seq->lss_hiwater_set, space, seq->lss_set_width); + + rc = seq_store_update(env, seq, NULL, 1); + + return rc; +} + +/* + * This function implements new seq allocation algorithm using async + * updates to seq file on disk. ref bug 18857 for details. + * there are four variable to keep track of this process + * + * lss_space; - available lss_space + * lss_lowater_set; - lu_seq_range for all seqs before barrier, i.e. safe to use + * lss_hiwater_set; - lu_seq_range after barrier, i.e. allocated but may be + * not yet committed + * + * when lss_lowater_set reaches the end it is replaced with hiwater one and + * a write operation is initiated to allocate new hiwater range. + * if last seq write opearion is still not commited, current operation is + * flaged as sync write op. + */ +static int range_alloc_set(const struct lu_env *env, + struct lu_seq_range *out, + struct lu_server_seq *seq) +{ + struct lu_seq_range *space = &seq->lss_space; + struct lu_seq_range *loset = &seq->lss_lowater_set; + struct lu_seq_range *hiset = &seq->lss_hiwater_set; + int rc = 0; + + if (range_is_zero(loset)) + __seq_set_init(env, seq); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_ALLOC)) /* exhaust set */ + loset->lsr_start = loset->lsr_end; + + if (range_is_exhausted(loset)) { + /* reached high water mark. */ + struct lu_device *dev = seq->lss_site->ss_lu->ls_top_dev; + int obd_num_clients = dev->ld_obd->obd_num_exports; + __u64 set_sz; + + /* calculate new seq width based on number of clients */ + set_sz = max(seq->lss_set_width, + obd_num_clients * seq->lss_width); + set_sz = min(range_space(space), set_sz); + + /* Switch to hiwater range now */ + *loset = *hiset; + /* allocate new hiwater range */ + range_alloc(hiset, space, set_sz); + + /* update ondisk seq with new *space */ + rc = seq_store_update(env, seq, NULL, seq->lss_need_sync); + } + + LASSERTF(!range_is_exhausted(loset) || range_is_sane(loset), + DRANGE"\n", PRANGE(loset)); + + if (rc == 0) + range_alloc(out, loset, seq->lss_width); + + RETURN(rc); +} + +static int __seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + struct lu_seq_range *space = &seq->lss_space; + int rc = 0; + + ENTRY; + + LASSERT(range_is_sane(space)); + + /* Check if available space ends and allocate new super seq */ + if (range_is_exhausted(space)) { + if (!seq->lss_cli) { + CERROR("%s: No sequence controller is attached.\n", + seq->lss_name); + RETURN(-ENODEV); + } + + rc = seq_client_alloc_super(seq->lss_cli, env); + if (rc) { + CERROR("%s: Can't allocate super-sequence, rc %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + /* Saving new range to allocation space. */ + *space = seq->lss_cli->lcs_space; + LASSERT(range_is_sane(space)); + } + + rc = range_alloc_set(env, out, seq); + if (rc != 0) { + CERROR("%s: Allocated meta-sequence failed: rc = %d\n", + seq->lss_name, rc); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Allocated meta-sequence " DRANGE"\n", + seq->lss_name, PRANGE(out)); + + RETURN(rc); +} + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lss_mutex); + rc = __seq_server_alloc_meta(seq, out, env); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(seq_server_alloc_meta); + +static int seq_server_handle(struct lu_site *site, + const struct lu_env *env, + __u32 opc, struct lu_seq_range *out) +{ + int rc; + struct seq_server_site *ss_site; + ENTRY; + + ss_site = lu_site2seq(site); + + switch (opc) { + case SEQ_ALLOC_META: + if (!ss_site->ss_server_seq) { + CERROR("Sequence server is not " + "initialized\n"); + RETURN(-EINVAL); + } + rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env); + break; + case SEQ_ALLOC_SUPER: + if (!ss_site->ss_control_seq) { + CERROR("Sequence controller is not " + "initialized\n"); + RETURN(-EINVAL); + } + rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env); + break; + default: + rc = -EINVAL; + break; + } + + RETURN(rc); +} + +static int seq_req_handle(struct ptlrpc_request *req, + const struct lu_env *env, + struct seq_thread_info *info) +{ + struct lu_seq_range *out, *tmp; + struct lu_site *site; + int rc = -EPROTO; + __u32 *opc; + ENTRY; + + LASSERT(!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)); + site = req->rq_export->exp_obd->obd_lu_dev->ld_site; + LASSERT(site != NULL); + + rc = req_capsule_server_pack(info->sti_pill); + if (rc) + RETURN(err_serious(rc)); + + opc = req_capsule_client_get(info->sti_pill, &RMF_SEQ_OPC); + if (opc != NULL) { + out = req_capsule_server_get(info->sti_pill, &RMF_SEQ_RANGE); + if (out == NULL) + RETURN(err_serious(-EPROTO)); + + tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE); + + /* seq client passed mdt id, we need to pass that using out + * range parameter */ + + out->lsr_index = tmp->lsr_index; + out->lsr_flags = tmp->lsr_flags; + rc = seq_server_handle(site, env, *opc, out); + } else + rc = err_serious(-EPROTO); + + RETURN(rc); +} + +/* context key constructor/destructor: seq_key_init, seq_key_fini */ +LU_KEY_INIT_FINI(seq, struct seq_thread_info); + +/* context key: seq_thread_key */ +LU_CONTEXT_KEY_DEFINE(seq, LCT_MD_THREAD | LCT_DT_THREAD); + +static void seq_thread_info_init(struct ptlrpc_request *req, + struct seq_thread_info *info) +{ + info->sti_pill = &req->rq_pill; + /* Init request capsule */ + req_capsule_init(info->sti_pill, req, RCL_SERVER); + req_capsule_set(info->sti_pill, &RQF_SEQ_QUERY); +} + +static void seq_thread_info_fini(struct seq_thread_info *info) +{ + req_capsule_fini(info->sti_pill); +} + +int seq_handle(struct ptlrpc_request *req) +{ + const struct lu_env *env; + struct seq_thread_info *info; + int rc; + + env = req->rq_svc_thread->t_env; + LASSERT(env != NULL); + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + seq_thread_info_init(req, info); + rc = seq_req_handle(req, env, info); + /* XXX: we don't need replay but MDT assign transno in any case, + * remove it manually before reply*/ + lustre_msg_set_transno(req->rq_repmsg, 0); + seq_thread_info_fini(info); + + return rc; +} +EXPORT_SYMBOL(seq_handle); + +/* + * Entry point for handling FLD RPCs called from MDT. + */ +int seq_query(struct com_thread_info *info) +{ + return seq_handle(info->cti_pill->rc_req); +} +EXPORT_SYMBOL(seq_query); + + +#ifdef LPROCFS +static int seq_server_proc_init(struct lu_server_seq *seq) +{ + int rc; + ENTRY; + + seq->lss_proc_dir = lprocfs_register(seq->lss_name, + seq_type_proc_dir, + NULL, NULL); + if (IS_ERR(seq->lss_proc_dir)) { + rc = PTR_ERR(seq->lss_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(seq->lss_proc_dir, + seq_server_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lss_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + seq_server_proc_fini(seq); + return rc; +} + +static void seq_server_proc_fini(struct lu_server_seq *seq) +{ + ENTRY; + if (seq->lss_proc_dir != NULL) { + if (!IS_ERR(seq->lss_proc_dir)) + lprocfs_remove(&seq->lss_proc_dir); + seq->lss_proc_dir = NULL; + } + EXIT; +} +#else +static int seq_server_proc_init(struct lu_server_seq *seq) +{ + return 0; +} + +static void seq_server_proc_fini(struct lu_server_seq *seq) +{ + return; +} +#endif + + +int seq_server_init(struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss, + const struct lu_env *env) +{ + int rc, is_srv = (type == LUSTRE_SEQ_SERVER); + ENTRY; + + LASSERT(dev != NULL); + LASSERT(prefix != NULL); + LASSERT(ss != NULL); + LASSERT(ss->ss_lu != NULL); + + seq->lss_cli = NULL; + seq->lss_type = type; + seq->lss_site = ss; + range_init(&seq->lss_space); + + range_init(&seq->lss_lowater_set); + range_init(&seq->lss_hiwater_set); + seq->lss_set_width = LUSTRE_SEQ_BATCH_WIDTH; + + mutex_init(&seq->lss_mutex); + + seq->lss_width = is_srv ? + LUSTRE_SEQ_META_WIDTH : LUSTRE_SEQ_SUPER_WIDTH; + + snprintf(seq->lss_name, sizeof(seq->lss_name), + "%s-%s", (is_srv ? "srv" : "ctl"), prefix); + + rc = seq_store_init(seq, env, dev); + if (rc) + GOTO(out, rc); + /* Request backing store for saved sequence info. */ + rc = seq_store_read(seq, env); + if (rc == -ENODATA) { + + /* Nothing is read, init by default value. */ + seq->lss_space = is_srv ? + LUSTRE_SEQ_ZERO_RANGE: + LUSTRE_SEQ_SPACE_RANGE; + + LASSERT(ss != NULL); + seq->lss_space.lsr_index = ss->ss_node_id; + LCONSOLE_INFO("%s: No data found " + "on store. Initialize space\n", + seq->lss_name); + + rc = seq_store_update(env, seq, NULL, 0); + if (rc) { + CERROR("%s: Can't write space data, " + "rc %d\n", seq->lss_name, rc); + } + } else if (rc) { + CERROR("%s: Can't read space data, rc %d\n", + seq->lss_name, rc); + GOTO(out, rc); + } + + if (is_srv) { + LASSERT(range_is_sane(&seq->lss_space)); + } else { + LASSERT(!range_is_zero(&seq->lss_space) && + range_is_sane(&seq->lss_space)); + } + + rc = seq_server_proc_init(seq); + if (rc) + GOTO(out, rc); + + EXIT; +out: + if (rc) + seq_server_fini(seq, env); + return rc; +} +EXPORT_SYMBOL(seq_server_init); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + seq_server_proc_fini(seq); + seq_store_fini(seq, env); + + EXIT; +} +EXPORT_SYMBOL(seq_server_fini); + +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss) +{ + if (ss == NULL) + RETURN(0); + + if (ss->ss_server_seq) { + seq_server_fini(ss->ss_server_seq, env); + OBD_FREE_PTR(ss->ss_server_seq); + ss->ss_server_seq = NULL; + } + + if (ss->ss_control_seq) { + seq_server_fini(ss->ss_control_seq, env); + OBD_FREE_PTR(ss->ss_control_seq); + ss->ss_control_seq = NULL; + } + + if (ss->ss_client_seq) { + seq_client_fini(ss->ss_client_seq); + OBD_FREE_PTR(ss->ss_client_seq); + ss->ss_client_seq = NULL; + } + + RETURN(0); +} +EXPORT_SYMBOL(seq_site_fini); + +proc_dir_entry_t *seq_type_proc_dir = NULL; + +static int __init fid_mod_init(void) +{ + seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(seq_type_proc_dir)) + return PTR_ERR(seq_type_proc_dir); + + LU_CONTEXT_KEY_INIT(&seq_thread_key); + lu_context_key_register(&seq_thread_key); + return 0; +} + +static void __exit fid_mod_exit(void) +{ + lu_context_key_degister(&seq_thread_key); + if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { + lprocfs_remove(&seq_type_proc_dir); + seq_type_proc_dir = NULL; + } +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FID Module"); +MODULE_LICENSE("GPL"); + +cfs_module(fid, "0.1.0", fid_mod_init, fid_mod_exit); diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h new file mode 100644 index 000000000000..c3a94f4c9fce --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/fid_internal.h @@ -0,0 +1,86 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_internal.h + * + * Author: Yury Umanets + */ +#ifndef __FID_INTERNAL_H +#define __FID_INTERNAL_H + +#include +#include + +#include + +struct seq_thread_info { + struct req_capsule *sti_pill; + struct lu_seq_range sti_space; + struct lu_buf sti_buf; +}; + +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + +extern struct lu_context_key seq_thread_key; + +/* Functions used internally in module. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env); + +/* Store API functions. */ +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt); + +void seq_store_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync); + +#ifdef LPROCFS +extern struct lprocfs_vars seq_server_proc_list[]; +extern struct lprocfs_vars seq_client_proc_list[]; +#endif + + +extern proc_dir_entry_t *seq_type_proc_dir; + +#endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c new file mode 100644 index 000000000000..eaff51a555fb --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/fid_lib.c @@ -0,0 +1,97 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_lib.c + * + * Miscellaneous fid functions. + * + * Author: Nikita Danilov + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +# include +# include + +#include +#include +#include + +/** + * A cluster-wide range from which fid-sequences are granted to servers and + * then clients. + * + * Fid namespace: + *
+ * Normal FID:	seq:64 [2^33,2^64-1]      oid:32	  ver:32
+ * IGIF      :	0:32, ino:32	      gen:32	  0:32
+ * IDIF      :	0:31, 1:1, ost-index:16,  objd:48	 0:32
+ * 
+ * + * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects + */ +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { + FID_SEQ_NORMAL, + (__u64)~0ULL +}; +EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE); + +/* Zero range, used for init and other purposes. */ +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { + 0, + 0 +}; +EXPORT_SYMBOL(LUSTRE_SEQ_ZERO_RANGE); + +/* Lustre Big Fs Lock fid. */ +const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, + .f_oid = FID_OID_SPECIAL_BFL, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, + .f_oid = FID_OID_DOT_LUSTRE_OBF, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c new file mode 100644 index 000000000000..fcaaca7e2e01 --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/fid_request.c @@ -0,0 +1,522 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_request.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +# include +# include + +#include +#include +#include +#include +#include +#include +#include +/* mdc RPC locks */ +#include +#include "fid_internal.h" + +static int seq_client_rpc(struct lu_client_seq *seq, + struct lu_seq_range *output, __u32 opc, + const char *opcname) +{ + struct obd_export *exp = seq->lcs_exp; + struct ptlrpc_request *req; + struct lu_seq_range *out, *in; + __u32 *op; + unsigned int debug_mask; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, + LUSTRE_MDS_VERSION, SEQ_QUERY); + if (req == NULL) + RETURN(-ENOMEM); + + /* Init operation code */ + op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); + *op = opc; + + /* Zero out input range, this is not recovery yet. */ + in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); + range_init(in); + + ptlrpc_request_set_replen(req); + + in->lsr_index = seq->lcs_space.lsr_index; + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + fld_range_set_mdt(in); + else + fld_range_set_ost(in); + + if (opc == SEQ_ALLOC_SUPER) { + req->rq_request_portal = SEQ_CONTROLLER_PORTAL; + req->rq_reply_portal = MDC_REPLY_PORTAL; + /* During allocating super sequence for data object, + * the current thread might hold the export of MDT0(MDT0 + * precreating objects on this OST), and it will send the + * request to MDT0 here, so we can not keep resending the + * request here, otherwise if MDT0 is failed(umounted), + * it can not release the export of MDT0 */ + if (seq->lcs_type == LUSTRE_SEQ_DATA) + req->rq_no_delay = req->rq_no_resend = 1; + debug_mask = D_CONSOLE; + } else { + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + req->rq_request_portal = SEQ_METADATA_PORTAL; + else + req->rq_request_portal = SEQ_DATA_PORTAL; + debug_mask = D_INFO; + } + + ptlrpc_at_set_req_timeout(req); + + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + GOTO(out_req, rc); + + out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); + *output = *out; + + if (!range_is_sane(output)) { + CERROR("%s: Invalid range received from server: " + DRANGE"\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + if (range_is_exhausted(output)) { + CERROR("%s: Range received from server is exhausted: " + DRANGE"]\n", seq->lcs_name, PRANGE(output)); + GOTO(out_req, rc = -EINVAL); + } + + CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence "DRANGE"]\n", + seq->lcs_name, opcname, PRANGE(output)); + + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +/* Request sequence-controller node to allocate new super-sequence. */ +int seq_client_alloc_super(struct lu_client_seq *seq, + const struct lu_env *env) +{ + int rc; + ENTRY; + + mutex_lock(&seq->lcs_mutex); + + if (seq->lcs_srv) { + LASSERT(env != NULL); + rc = seq_server_alloc_super(seq->lcs_srv, &seq->lcs_space, + env); + } else { + /* Check whether the connection to seq controller has been + * setup (lcs_exp != NULL) */ + if (seq->lcs_exp == NULL) { + mutex_unlock(&seq->lcs_mutex); + RETURN(-EINPROGRESS); + } + + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_SUPER, "super"); + } + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); +} + +/* Request sequence-controller node to allocate new meta-sequence. */ +static int seq_client_alloc_meta(const struct lu_env *env, + struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + if (seq->lcs_srv) { + LASSERT(env != NULL); + rc = seq_server_alloc_meta(seq->lcs_srv, &seq->lcs_space, env); + } else { + do { + /* If meta server return -EINPROGRESS or EAGAIN, + * it means meta server might not be ready to + * allocate super sequence from sequence controller + * (MDT0)yet */ + rc = seq_client_rpc(seq, &seq->lcs_space, + SEQ_ALLOC_META, "meta"); + } while (rc == -EINPROGRESS || rc == -EAGAIN); + } + RETURN(rc); +} + +/* Allocate new sequence for client. */ +static int seq_client_alloc_seq(const struct lu_env *env, + struct lu_client_seq *seq, seqno_t *seqnr) +{ + int rc; + ENTRY; + + LASSERT(range_is_sane(&seq->lcs_space)); + + if (range_is_exhausted(&seq->lcs_space)) { + rc = seq_client_alloc_meta(env, seq); + if (rc) { + CERROR("%s: Can't allocate new meta-sequence," + "rc %d\n", seq->lcs_name, rc); + RETURN(rc); + } else { + CDEBUG(D_INFO, "%s: New range - "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + } else { + rc = 0; + } + + LASSERT(!range_is_exhausted(&seq->lcs_space)); + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; + + CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name, + *seqnr); + + RETURN(rc); +} + +static int seq_fid_alloc_prep(struct lu_client_seq *seq, + wait_queue_t *link) +{ + if (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + waitq_wait(link, TASK_UNINTERRUPTIBLE); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, link); + set_current_state(TASK_RUNNING); + return -EAGAIN; + } + ++seq->lcs_update; + mutex_unlock(&seq->lcs_mutex); + return 0; +} + +static void seq_fid_alloc_fini(struct lu_client_seq *seq) +{ + LASSERT(seq->lcs_update == 1); + mutex_lock(&seq->lcs_mutex); + --seq->lcs_update; + wake_up(&seq->lcs_waitq); +} + +/** + * Allocate the whole seq to the caller. + **/ +int seq_client_get_seq(const struct lu_env *env, + struct lu_client_seq *seq, seqno_t *seqnr) +{ + wait_queue_t link; + int rc; + + LASSERT(seqnr != NULL); + mutex_lock(&seq->lcs_mutex); + init_waitqueue_entry_current(&link); + + while (1) { + rc = seq_fid_alloc_prep(seq, &link); + if (rc == 0) + break; + } + + rc = seq_client_alloc_seq(env, seq, seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, " + "rc %d\n", seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + return rc; + } + + CDEBUG(D_INFO, "%s: allocate sequence " + "[0x%16.16"LPF64"x]\n", seq->lcs_name, *seqnr); + + /* Since the caller require the whole seq, + * so marked this seq to be used */ + if (seq->lcs_type == LUSTRE_SEQ_METADATA) + seq->lcs_fid.f_oid = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_fid.f_oid = LUSTRE_DATA_SEQ_MAX_WIDTH; + + seq->lcs_fid.f_seq = *seqnr; + seq->lcs_fid.f_ver = 0; + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + + return rc; +} +EXPORT_SYMBOL(seq_client_get_seq); + +/* Allocate new fid on passed client @seq and save it to @fid. */ +int seq_client_alloc_fid(const struct lu_env *env, + struct lu_client_seq *seq, struct lu_fid *fid) +{ + wait_queue_t link; + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(fid != NULL); + + init_waitqueue_entry_current(&link); + mutex_lock(&seq->lcs_mutex); + + if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) + seq->lcs_fid.f_oid = seq->lcs_width; + + while (1) { + seqno_t seqnr; + + if (!fid_is_zero(&seq->lcs_fid) && + fid_oid(&seq->lcs_fid) < seq->lcs_width) { + /* Just bump last allocated fid and return to caller. */ + seq->lcs_fid.f_oid += 1; + rc = 0; + break; + } + + rc = seq_fid_alloc_prep(seq, &link); + if (rc) + continue; + + rc = seq_client_alloc_seq(env, seq, &seqnr); + if (rc) { + CERROR("%s: Can't allocate new sequence, " + "rc %d\n", seq->lcs_name, rc); + seq_fid_alloc_fini(seq); + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); + } + + CDEBUG(D_INFO, "%s: Switch to sequence " + "[0x%16.16"LPF64"x]\n", seq->lcs_name, seqnr); + + seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; + seq->lcs_fid.f_seq = seqnr; + seq->lcs_fid.f_ver = 0; + + /* + * Inform caller that sequence switch is performed to allow it + * to setup FLD for it. + */ + rc = 1; + + seq_fid_alloc_fini(seq); + break; + } + + *fid = seq->lcs_fid; + mutex_unlock(&seq->lcs_mutex); + + CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid)); + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_alloc_fid); + +/* + * Finish the current sequence due to disconnect. + * See mdc_import_event() + */ +void seq_client_flush(struct lu_client_seq *seq) +{ + wait_queue_t link; + + LASSERT(seq != NULL); + init_waitqueue_entry_current(&link); + mutex_lock(&seq->lcs_mutex); + + while (seq->lcs_update) { + add_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&seq->lcs_mutex); + + waitq_wait(&link, TASK_UNINTERRUPTIBLE); + + mutex_lock(&seq->lcs_mutex); + remove_wait_queue(&seq->lcs_waitq, &link); + set_current_state(TASK_RUNNING); + } + + fid_zero(&seq->lcs_fid); + /** + * this id shld not be used for seq range allocation. + * set to -1 for dgb check. + */ + + seq->lcs_space.lsr_index = -1; + + range_init(&seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); +} +EXPORT_SYMBOL(seq_client_flush); + +static void seq_client_proc_fini(struct lu_client_seq *seq); + +#ifdef LPROCFS +static int seq_client_proc_init(struct lu_client_seq *seq) +{ + int rc; + ENTRY; + + seq->lcs_proc_dir = lprocfs_register(seq->lcs_name, + seq_type_proc_dir, + NULL, NULL); + + if (IS_ERR(seq->lcs_proc_dir)) { + CERROR("%s: LProcFS failed in seq-init\n", + seq->lcs_name); + rc = PTR_ERR(seq->lcs_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(seq->lcs_proc_dir, + seq_client_proc_list, seq); + if (rc) { + CERROR("%s: Can't init sequence manager " + "proc, rc %d\n", seq->lcs_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + seq_client_proc_fini(seq); + return rc; +} + +static void seq_client_proc_fini(struct lu_client_seq *seq) +{ + ENTRY; + if (seq->lcs_proc_dir) { + if (!IS_ERR(seq->lcs_proc_dir)) + lprocfs_remove(&seq->lcs_proc_dir); + seq->lcs_proc_dir = NULL; + } + EXIT; +} +#else +static int seq_client_proc_init(struct lu_client_seq *seq) +{ + return 0; +} + +static void seq_client_proc_fini(struct lu_client_seq *seq) +{ + return; +} +#endif + +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv) +{ + int rc; + ENTRY; + + LASSERT(seq != NULL); + LASSERT(prefix != NULL); + + seq->lcs_srv = srv; + seq->lcs_type = type; + + mutex_init(&seq->lcs_mutex); + if (type == LUSTRE_SEQ_METADATA) + seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; + else + seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; + + init_waitqueue_head(&seq->lcs_waitq); + /* Make sure that things are clear before work is started. */ + seq_client_flush(seq); + + if (exp != NULL) + seq->lcs_exp = class_export_get(exp); + else if (type == LUSTRE_SEQ_METADATA) + LASSERT(seq->lcs_srv != NULL); + + snprintf(seq->lcs_name, sizeof(seq->lcs_name), + "cli-%s", prefix); + + rc = seq_client_proc_init(seq); + if (rc) + seq_client_fini(seq); + RETURN(rc); +} +EXPORT_SYMBOL(seq_client_init); + +void seq_client_fini(struct lu_client_seq *seq) +{ + ENTRY; + + seq_client_proc_fini(seq); + + if (seq->lcs_exp != NULL) { + class_export_put(seq->lcs_exp); + seq->lcs_exp = NULL; + } + + seq->lcs_srv = NULL; + EXIT; +} +EXPORT_SYMBOL(seq_client_fini); diff --git a/drivers/staging/lustre/lustre/fid/fid_store.c b/drivers/staging/lustre/lustre/fid/fid_store.c new file mode 100644 index 000000000000..a90e6e37d689 --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/fid_store.c @@ -0,0 +1,259 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/fid_store.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +# include +# include + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + + +static struct lu_buf *seq_store_buf(struct seq_thread_info *info) +{ + struct lu_buf *buf; + + buf = &info->sti_buf; + buf->lb_buf = &info->sti_space; + buf->lb_len = sizeof(info->sti_space); + return buf; +} + +struct seq_update_callback { + struct dt_txn_commit_cb suc_cb; + struct lu_server_seq *suc_seq; +}; + +void seq_update_cb(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err) +{ + struct seq_update_callback *ccb; + + ccb = container_of0(cb, struct seq_update_callback, suc_cb); + + LASSERT(ccb->suc_seq != NULL); + + ccb->suc_seq->lss_need_sync = 0; + OBD_FREE_PTR(ccb); +} + +int seq_update_cb_add(struct thandle *th, struct lu_server_seq *seq) +{ + struct seq_update_callback *ccb; + struct dt_txn_commit_cb *dcb; + int rc; + + OBD_ALLOC_PTR(ccb); + if (ccb == NULL) + return -ENOMEM; + + ccb->suc_seq = seq; + seq->lss_need_sync = 1; + + dcb = &ccb->suc_cb; + dcb->dcb_func = seq_update_cb; + INIT_LIST_HEAD(&dcb->dcb_linkage); + strncpy(dcb->dcb_name, "seq_update_cb", MAX_COMMIT_CB_STR_LEN); + dcb->dcb_name[MAX_COMMIT_CB_STR_LEN - 1] = '\0'; + + rc = dt_trans_cb_add(th, dcb); + if (rc) + OBD_FREE_PTR(ccb); + return rc; +} + +/* This function implies that caller takes care about locking. */ +int seq_store_update(const struct lu_env *env, struct lu_server_seq *seq, + struct lu_seq_range *out, int sync) +{ + struct dt_device *dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + struct seq_thread_info *info; + struct thandle *th; + loff_t pos = 0; + int rc; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + th = dt_trans_create(env, dt_dev); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = dt_declare_record_write(env, seq->lss_obj, + sizeof(struct lu_seq_range), 0, th); + if (rc) + GOTO(exit, rc); + + if (out != NULL) { + rc = fld_declare_server_create(env, + seq->lss_site->ss_server_fld, + out, th); + if (rc) + GOTO(exit, rc); + } + + rc = dt_trans_start_local(env, dt_dev, th); + if (rc) + GOTO(exit, rc); + + /* Store ranges in le format. */ + range_cpu_to_le(&info->sti_space, &seq->lss_space); + + rc = dt_record_write(env, seq->lss_obj, seq_store_buf(info), &pos, th); + if (rc) { + CERROR("%s: Can't write space data, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } else if (out != NULL) { + rc = fld_server_create(env, seq->lss_site->ss_server_fld, out, + th); + if (rc) { + CERROR("%s: Can't Update fld database, rc %d\n", + seq->lss_name, rc); + GOTO(exit, rc); + } + } + /* next sequence update will need sync until this update is committed + * in case of sync operation this is not needed obviously */ + if (!sync) + /* if callback can't be added then sync always */ + sync = !!seq_update_cb_add(th, seq); + + th->th_sync |= sync; +exit: + dt_trans_stop(env, dt_dev, th); + return rc; +} + +/* + * This function implies that caller takes care about locking or locking is not + * needed (init time). + */ +int seq_store_read(struct lu_server_seq *seq, + const struct lu_env *env) +{ + struct seq_thread_info *info; + loff_t pos = 0; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + rc = seq->lss_obj->do_body_ops->dbo_read(env, seq->lss_obj, + seq_store_buf(info), + &pos, BYPASS_CAPA); + + if (rc == sizeof(info->sti_space)) { + range_le_to_cpu(&seq->lss_space, &info->sti_space); + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc == 0) { + rc = -ENODATA; + } else if (rc > 0) { + CERROR("%s: Read only %d bytes of %d\n", seq->lss_name, + rc, (int)sizeof(info->sti_space)); + rc = -EIO; + } + + RETURN(rc); +} + +int seq_store_init(struct lu_server_seq *seq, + const struct lu_env *env, + struct dt_device *dt) +{ + struct dt_object *dt_obj; + struct lu_fid fid; + struct lu_attr attr; + struct dt_object_format dof; + const char *name; + int rc; + ENTRY; + + name = seq->lss_type == LUSTRE_SEQ_SERVER ? + LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; + + if (seq->lss_type == LUSTRE_SEQ_SERVER) + lu_local_obj_fid(&fid, FID_SEQ_SRV_OID); + else + lu_local_obj_fid(&fid, FID_SEQ_CTL_OID); + + memset(&attr, 0, sizeof(attr)); + attr.la_valid = LA_MODE; + attr.la_mode = S_IFREG | 0666; + dof.dof_type = DFT_REGULAR; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, &attr); + if (!IS_ERR(dt_obj)) { + seq->lss_obj = dt_obj; + rc = 0; + } else { + CERROR("%s: Can't find \"%s\" obj %d\n", + seq->lss_name, name, (int)PTR_ERR(dt_obj)); + rc = PTR_ERR(dt_obj); + } + + RETURN(rc); +} + +void seq_store_fini(struct lu_server_seq *seq, + const struct lu_env *env) +{ + ENTRY; + + if (seq->lss_obj != NULL) { + if (!IS_ERR(seq->lss_obj)) + lu_object_put(env, &seq->lss_obj->do_lu); + seq->lss_obj = NULL; + } + + EXIT; +} diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c new file mode 100644 index 000000000000..49ea357be686 --- /dev/null +++ b/drivers/staging/lustre/lustre/fid/lproc_fid.c @@ -0,0 +1,360 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fid/lproc_fid.c + * + * Lustre Sequence Manager + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FID + +# include +# include + +#include +#include +#include +#include +#include +#include +#include +#include "fid_internal.h" + +#ifdef LPROCFS +/* + * Note: this function is only used for testing, it is no safe for production + * use. + */ +static int +seq_proc_write_common(struct file *file, const char *buffer, + unsigned long count, void *data, + struct lu_seq_range *range) +{ + struct lu_seq_range tmp; + int rc; + ENTRY; + + LASSERT(range != NULL); + + rc = sscanf(buffer, "[%llx - %llx]\n", + (long long unsigned *)&tmp.lsr_start, + (long long unsigned *)&tmp.lsr_end); + if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp)) + RETURN(-EINVAL); + *range = tmp; + RETURN(0); +} + +static int +seq_proc_read_common(char *page, char **start, off_t off, + int count, int *eof, void *data, + struct lu_seq_range *range) +{ + int rc; + ENTRY; + + *eof = 1; + rc = snprintf(page, count, "["LPX64" - "LPX64"]:%x:%s\n", + PRANGE(range)); + RETURN(rc); +} + +/* + * Server side procfs stuff. + */ +static int +seq_server_proc_write_space(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + rc = seq_proc_write_common(file, buffer, count, + data, &seq->lss_space); + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + } + + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +seq_server_proc_read_space(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + rc = seq_proc_read_common(page, start, off, count, eof, + data, &seq->lss_space); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +static int +seq_server_proc_read_server(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)data; + struct client_obd *cli; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + *eof = 1; + if (seq->lss_cli) { + if (seq->lss_cli->lcs_exp != NULL) { + cli = &seq->lss_cli->lcs_exp->exp_obd->u.cli; + rc = snprintf(page, count, "%s\n", + cli->cl_target_uuid.uuid); + } else { + rc = snprintf(page, count, "%s\n", + seq->lss_cli->lcs_srv->lss_name); + } + } else { + rc = snprintf(page, count, "\n"); + } + + RETURN(rc); +} + +static int +seq_server_proc_write_width(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)data; + int rc, val; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc != 0) { + CERROR("%s: invalid width.\n", seq->lss_name); + GOTO(out_unlock, rc); + } + + seq->lss_width = val; + + CDEBUG(D_INFO, "%s: Width: "LPU64"\n", + seq->lss_name, seq->lss_width); +out_unlock: + mutex_unlock(&seq->lss_mutex); + + RETURN(count); +} + +static int +seq_server_proc_read_width(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_server_seq *seq = (struct lu_server_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lss_mutex); + rc = snprintf(page, count, LPU64"\n", seq->lss_width); + mutex_unlock(&seq->lss_mutex); + + RETURN(rc); +} + +/* Client side procfs stuff */ +static int +seq_client_proc_write_space(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = seq_proc_write_common(file, buffer, count, + data, &seq->lcs_space); + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Space: "DRANGE"\n", + seq->lcs_name, PRANGE(&seq->lcs_space)); + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int +seq_client_proc_read_space(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = seq_proc_read_common(page, start, off, count, eof, + data, &seq->lcs_space); + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); +} + +static int +seq_client_proc_write_width(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + __u64 max; + int rc, val; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) { + mutex_unlock(&seq->lcs_mutex); + RETURN(rc); + } + + if (seq->lcs_type == LUSTRE_SEQ_DATA) + max = LUSTRE_DATA_SEQ_MAX_WIDTH; + else + max = LUSTRE_METADATA_SEQ_MAX_WIDTH; + + if (val <= max && val > 0) { + seq->lcs_width = val; + + if (rc == 0) { + CDEBUG(D_INFO, "%s: Sequence size: "LPU64"\n", + seq->lcs_name, seq->lcs_width); + } + } + + mutex_unlock(&seq->lcs_mutex); + + RETURN(count); +} + +static int +seq_client_proc_read_width(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = snprintf(page, count, LPU64"\n", seq->lcs_width); + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); +} + +static int +seq_client_proc_read_fid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + mutex_lock(&seq->lcs_mutex); + rc = snprintf(page, count, DFID"\n", PFID(&seq->lcs_fid)); + mutex_unlock(&seq->lcs_mutex); + + RETURN(rc); +} + +static int +seq_client_proc_read_server(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_seq *seq = (struct lu_client_seq *)data; + struct client_obd *cli; + int rc; + ENTRY; + + LASSERT(seq != NULL); + + if (seq->lcs_exp != NULL) { + cli = &seq->lcs_exp->exp_obd->u.cli; + rc = snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid); + } else { + rc = snprintf(page, count, "%s\n", seq->lcs_srv->lss_name); + } + RETURN(rc); +} + +struct lprocfs_vars seq_server_proc_list[] = { + { "space", seq_server_proc_read_space, seq_server_proc_write_space, NULL }, + { "width", seq_server_proc_read_width, seq_server_proc_write_width, NULL }, + { "server", seq_server_proc_read_server, NULL, NULL }, + { NULL }}; + +struct lprocfs_vars seq_client_proc_list[] = { + { "space", seq_client_proc_read_space, seq_client_proc_write_space, NULL }, + { "width", seq_client_proc_read_width, seq_client_proc_write_width, NULL }, + { "server", seq_client_proc_read_server, NULL, NULL }, + { "fid", seq_client_proc_read_fid, NULL, NULL }, + { NULL }}; +#endif diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile new file mode 100644 index 000000000000..e7f2881a1d9e --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += fld.o +fld-y := fld_handler.o fld_request.o fld_cache.o fld_index.o lproc_fld.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c new file mode 100644 index 000000000000..347f2ae83bc8 --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/fld_cache.c @@ -0,0 +1,566 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_cache.c + * + * FLD (Fids Location Database) + * + * Author: Pravin Shelar + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +# include +# include +# include +# include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "fld_internal.h" + +/** + * create fld cache. + */ +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold) +{ + struct fld_cache *cache; + ENTRY; + + LASSERT(name != NULL); + LASSERT(cache_threshold < cache_size); + + OBD_ALLOC_PTR(cache); + if (cache == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + INIT_LIST_HEAD(&cache->fci_entries_head); + INIT_LIST_HEAD(&cache->fci_lru); + + cache->fci_cache_count = 0; + rwlock_init(&cache->fci_lock); + + strlcpy(cache->fci_name, name, + sizeof(cache->fci_name)); + + cache->fci_cache_size = cache_size; + cache->fci_threshold = cache_threshold; + + /* Init fld cache info. */ + memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); + + CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", + cache->fci_name, cache_size, cache_threshold); + + RETURN(cache); +} + +/** + * destroy fld cache. + */ +void fld_cache_fini(struct fld_cache *cache) +{ + __u64 pct; + ENTRY; + + LASSERT(cache != NULL); + fld_cache_flush(cache); + + if (cache->fci_stat.fst_count > 0) { + pct = cache->fci_stat.fst_cache * 100; + do_div(pct, cache->fci_stat.fst_count); + } else { + pct = 0; + } + + CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); + CDEBUG(D_INFO, " Total reqs: "LPU64"\n", cache->fci_stat.fst_count); + CDEBUG(D_INFO, " Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache); + CDEBUG(D_INFO, " Cache hits: "LPU64"%%\n", pct); + + OBD_FREE_PTR(cache); + + EXIT; +} + +/** + * delete given node from list. + */ +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * fix list by checking new entry with NEXT entry in order. + */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + if (c_range->lsr_flags != n_range->lsr_flags) + continue; + + LASSERTF(c_range->lsr_start <= n_range->lsr_start, + "cur lsr_start "DRANGE" next lsr_start "DRANGE"\n", + PRANGE(c_range), PRANGE(n_range)); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_index != n_range->lsr_index) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + if (c_range->lsr_index == n_range->lsr_index) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. */ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + + EXIT; +} + +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) +{ + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. + */ +static int fld_cache_shrink(struct fld_cache *cache) +{ + struct fld_cache_entry *flde; + struct list_head *curr; + int num = 0; + ENTRY; + + LASSERT(cache != NULL); + + if (cache->fci_cache_count < cache->fci_cache_size) + RETURN(0); + + curr = cache->fci_lru.prev; + + while (cache->fci_cache_count + cache->fci_threshold > + cache->fci_cache_size && curr != &cache->fci_lru) { + + flde = list_entry(curr, struct fld_cache_entry, fce_lru); + curr = curr->prev; + fld_cache_entry_delete(cache, flde); + num++; + } + + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by " + "%d entries\n", cache->fci_name, num); + + RETURN(0); +} + +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) +{ + ENTRY; + + write_lock(&cache->fci_lock); + cache->fci_cache_size = 0; + fld_cache_shrink(cache); + write_unlock(&cache->fci_lock); + + EXIT; +} + +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. + */ + +void fld_cache_punch_hole(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const seqno_t new_start = range->lsr_start; + const seqno_t new_end = range->lsr_end; + struct fld_cache_entry *fldt; + + ENTRY; + OBD_ALLOC_GFP(fldt, sizeof *fldt, GFP_ATOMIC); + if (!fldt) { + OBD_FREE_PTR(f_new); + EXIT; + /* overlap is not allowed, so dont mess up list. */ + return; + } + /* break f_curr RANGE into three RANGES: + * f_curr, f_new , fldt + */ + + /* f_new = *range */ + + /* fldt */ + fldt->fce_range.lsr_start = new_end; + fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; + fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; + + /* f_curr */ + f_curr->fce_range.lsr_end = new_start; + + /* add these two entries to list */ + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + fld_cache_entry_add(cache, fldt, &f_new->fce_list); + + /* no need to fixup */ + EXIT; +} + +/** + * handle range overlap in fld cache. + */ +static void fld_cache_overlap_handle(struct fld_cache *cache, + struct fld_cache_entry *f_curr, + struct fld_cache_entry *f_new) +{ + const struct lu_seq_range *range = &f_new->fce_range; + const seqno_t new_start = range->lsr_start; + const seqno_t new_end = range->lsr_end; + const mdsno_t mdt = range->lsr_index; + + /* this is overlap case, these case are checking overlapping with + * prev range only. fixup will handle overlaping with next range. */ + + if (f_curr->fce_range.lsr_index == mdt) { + f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, + new_start); + + f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, + new_end); + + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (new_start <= f_curr->fce_range.lsr_start && + f_curr->fce_range.lsr_end <= new_end) { + /* case 1: new range completely overshadowed existing range. + * e.g. whole range migrated. update fld cache entry */ + + f_curr->fce_range = *range; + OBD_FREE_PTR(f_new); + fld_fix_new_list(cache); + + } else if (f_curr->fce_range.lsr_start < new_start && + new_end < f_curr->fce_range.lsr_end) { + /* case 2: new range fit within existing range. */ + + fld_cache_punch_hole(cache, f_curr, f_new); + + } else if (new_end <= f_curr->fce_range.lsr_end) { + /* case 3: overlap: + * [new_start [c_start new_end) c_end) + */ + + LASSERT(new_start <= f_curr->fce_range.lsr_start); + + f_curr->fce_range.lsr_start = new_end; + fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); + + } else if (f_curr->fce_range.lsr_start <= new_start) { + /* case 4: overlap: + * [c_start [new_start c_end) new_end) + */ + + LASSERT(f_curr->fce_range.lsr_end <= new_end); + + f_curr->fce_range.lsr_end = new_start; + fld_cache_entry_add(cache, f_new, &f_curr->fce_list); + } else + CERROR("NEW range ="DRANGE" curr = "DRANGE"\n", + PRANGE(range),PRANGE(&f_curr->fce_range)); +} + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range) +{ + struct fld_cache_entry *f_new; + + LASSERT(range_is_sane(range)); + + OBD_ALLOC_PTR(f_new); + if (!f_new) + RETURN(ERR_PTR(-ENOMEM)); + + f_new->fce_range = *range; + RETURN(f_new); +} + +/** + * Insert FLD entry in FLD cache. + * + * This function handles all cases of merging and breaking up of + * ranges. + */ +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *n; + struct list_head *head; + struct list_head *prev = NULL; + const seqno_t new_start = f_new->fce_range.lsr_start; + const seqno_t new_end = f_new->fce_range.lsr_end; + __u32 new_flags = f_new->fce_range.lsr_flags; + ENTRY; + + /* + * Duplicate entries are eliminated in insert op. + * So we don't need to search new entry before starting + * insertion loop. + */ + + if (!cache->fci_no_shrink) + fld_cache_shrink(cache); + + head = &cache->fci_entries_head; + + list_for_each_entry_safe(f_curr, n, head, fce_list) { + /* add list if next is end of list */ + if (new_end < f_curr->fce_range.lsr_start || + (new_end == f_curr->fce_range.lsr_start && + new_flags != f_curr->fce_range.lsr_flags)) + break; + + prev = &f_curr->fce_list; + /* check if this range is to left of new range. */ + if (new_start < f_curr->fce_range.lsr_end && + new_flags == f_curr->fce_range.lsr_flags) { + fld_cache_overlap_handle(cache, f_curr, f_new); + goto out; + } + } + + if (prev == NULL) + prev = head; + + CDEBUG(D_INFO, "insert range "DRANGE"\n", PRANGE(&f_new->fce_range)); + /* Add new entry to cache and lru list. */ + fld_cache_entry_add(cache, f_new, prev); +out: + RETURN(0); +} + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + int rc; + + flde = fld_cache_entry_create(range); + if (IS_ERR(flde)) + RETURN(PTR_ERR(flde)); + + write_lock(&cache->fci_lock); + rc = fld_cache_insert_nolock(cache, flde); + write_unlock(&cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); + + RETURN(rc); +} + +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *tmp; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry_safe(flde, tmp, head, fce_list) { + /* add list if next is end of list */ + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + fld_cache_entry_delete(cache, flde); + break; + } + } +} + +/** + * Delete FLD entry in FLD cache. + * + */ +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range) +{ + write_lock(&cache->fci_lock); + fld_cache_delete_nolock(cache, range); + write_unlock(&cache->fci_lock); +} + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *got = NULL; + struct list_head *head; + + head = &cache->fci_entries_head; + list_for_each_entry(flde, head, fce_list) { + if (range->lsr_start == flde->fce_range.lsr_start || + (range->lsr_end == flde->fce_range.lsr_end && + range->lsr_flags == flde->fce_range.lsr_flags)) { + got = flde; + break; + } + } + + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +struct fld_cache_entry +*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range) +{ + struct fld_cache_entry *got = NULL; + ENTRY; + + read_lock(&cache->fci_lock); + got = fld_cache_entry_lookup_nolock(cache, range); + read_unlock(&cache->fci_lock); + RETURN(got); +} + +/** + * lookup \a seq sequence for range in fld cache. + */ +int fld_cache_lookup(struct fld_cache *cache, + const seqno_t seq, struct lu_seq_range *range) +{ + struct fld_cache_entry *flde; + struct fld_cache_entry *prev = NULL; + struct list_head *head; + ENTRY; + + read_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + + cache->fci_stat.fst_count++; + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) { + if (prev != NULL) + *range = prev->fce_range; + break; + } + + prev = flde; + if (range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + cache->fci_stat.fst_cache++; + read_unlock(&cache->fci_lock); + RETURN(0); + } + } + read_unlock(&cache->fci_lock); + RETURN(-ENOENT); +} diff --git a/drivers/staging/lustre/lustre/fld/fld_handler.c b/drivers/staging/lustre/lustre/fld/fld_handler.c new file mode 100644 index 000000000000..d2707ae4ad57 --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/fld_handler.c @@ -0,0 +1,447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_handler.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + * Author: WangDi + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_FLD + +# include +# include +# include +# include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "fld_internal.h" +#include + + +/* context key constructor/destructor: fld_key_init, fld_key_fini */ +LU_KEY_INIT_FINI(fld, struct fld_thread_info); + +/* context key: fld_thread_key */ +LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD); + +proc_dir_entry_t *fld_type_proc_dir = NULL; + +static int __init fld_mod_init(void) +{ + fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(fld_type_proc_dir)) + return PTR_ERR(fld_type_proc_dir); + + LU_CONTEXT_KEY_INIT(&fld_thread_key); + lu_context_key_register(&fld_thread_key); + return 0; +} + +static void __exit fld_mod_exit(void) +{ + lu_context_key_degister(&fld_thread_key); + if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { + lprocfs_remove(&fld_type_proc_dir); + fld_type_proc_dir = NULL; + } +} + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + struct lu_seq_range *range, + struct thandle *th) +{ + int rc; + + rc = fld_declare_index_create(env, fld, range, th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_declare_server_create); + +/** + * Insert FLD index entry and update FLD cache. + * + * This function is called from the sequence allocator when a super-sequence + * is granted to a server. + */ +int fld_server_create(const struct lu_env *env, struct lu_server_fld *fld, + struct lu_seq_range *range, struct thandle *th) +{ + int rc; + + mutex_lock(&fld->lsf_lock); + rc = fld_index_create(env, fld, range, th); + mutex_unlock(&fld->lsf_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_create); + +/** + * Lookup mds by seq, returns a range for given seq. + * + * If that entry is not cached in fld cache, request is sent to super + * sequence controller node (MDT0). All other MDT[1...N] and client + * cache fld entries, but this cache is not persistent. + */ +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + seqno_t seq, struct lu_seq_range *range) +{ + struct lu_seq_range *erange; + struct fld_thread_info *info; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + erange = &info->fti_lrange; + + /* Lookup it in the cache. */ + rc = fld_cache_lookup(fld->lsf_cache, seq, erange); + if (rc == 0) { + if (unlikely(fld_range_type(erange) != fld_range_type(range) && + !fld_range_is_any(range))) { + CERROR("%s: FLD cache range "DRANGE" does not match" + "requested flag %x: rc = %d\n", fld->lsf_name, + PRANGE(erange), range->lsr_flags, -EIO); + RETURN(-EIO); + } + *range = *erange; + RETURN(0); + } + + if (fld->lsf_obj) { + /* On server side, all entries should be in cache. + * If we can not find it in cache, just return error */ + CERROR("%s: Cannot find sequence "LPX64": rc = %d\n", + fld->lsf_name, seq, -EIO); + RETURN(-EIO); + } else { + LASSERT(fld->lsf_control_exp); + /* send request to mdt0 i.e. super seq. controller. + * This is temporary solution, long term solution is fld + * replication on all mdt servers. + */ + range->lsr_start = seq; + rc = fld_client_rpc(fld->lsf_control_exp, + range, FLD_LOOKUP); + if (rc == 0) + fld_cache_insert(fld->lsf_cache, range); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_server_lookup); + +/** + * All MDT server handle fld lookup operation. But only MDT0 has fld index. + * if entry is not found in cache we need to forward lookup request to MDT0 + */ + +static int fld_server_handle(struct lu_server_fld *fld, + const struct lu_env *env, + __u32 opc, struct lu_seq_range *range, + struct fld_thread_info *info) +{ + int rc; + ENTRY; + + switch (opc) { + case FLD_LOOKUP: + rc = fld_server_lookup(env, fld, range->lsr_start, range); + break; + default: + rc = -EINVAL; + break; + } + + CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %d, range: " + DRANGE"\n", fld->lsf_name, rc, opc, PRANGE(range)); + + RETURN(rc); + +} + +static int fld_req_handle(struct ptlrpc_request *req, + struct fld_thread_info *info) +{ + struct obd_export *exp = req->rq_export; + struct lu_site *site = exp->exp_obd->obd_lu_dev->ld_site; + struct lu_seq_range *in; + struct lu_seq_range *out; + int rc; + __u32 *opc; + ENTRY; + + rc = req_capsule_server_pack(info->fti_pill); + if (rc) + RETURN(err_serious(rc)); + + opc = req_capsule_client_get(info->fti_pill, &RMF_FLD_OPC); + if (opc != NULL) { + in = req_capsule_client_get(info->fti_pill, &RMF_FLD_MDFLD); + if (in == NULL) + RETURN(err_serious(-EPROTO)); + out = req_capsule_server_get(info->fti_pill, &RMF_FLD_MDFLD); + if (out == NULL) + RETURN(err_serious(-EPROTO)); + *out = *in; + + /* For old 2.0 client, the 'lsr_flags' is uninitialized. + * Set it as 'LU_SEQ_RANGE_MDT' by default. */ + if (!(exp_connect_flags(exp) & OBD_CONNECT_64BITHASH) && + !(exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) && + !(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) && + !exp->exp_libclient) + fld_range_set_mdt(out); + + rc = fld_server_handle(lu_site2seq(site)->ss_server_fld, + req->rq_svc_thread->t_env, + *opc, out, info); + } else { + rc = err_serious(-EPROTO); + } + + RETURN(rc); +} + +static void fld_thread_info_init(struct ptlrpc_request *req, + struct fld_thread_info *info) +{ + info->fti_pill = &req->rq_pill; + /* Init request capsule. */ + req_capsule_init(info->fti_pill, req, RCL_SERVER); + req_capsule_set(info->fti_pill, &RQF_FLD_QUERY); +} + +static void fld_thread_info_fini(struct fld_thread_info *info) +{ + req_capsule_fini(info->fti_pill); +} + +static int fld_handle(struct ptlrpc_request *req) +{ + struct fld_thread_info *info; + const struct lu_env *env; + int rc; + + env = req->rq_svc_thread->t_env; + LASSERT(env != NULL); + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + fld_thread_info_init(req, info); + rc = fld_req_handle(req, info); + fld_thread_info_fini(info); + + return rc; +} + +/* + * Entry point for handling FLD RPCs called from MDT. + */ +int fld_query(struct com_thread_info *info) +{ + return fld_handle(info->cti_pill->rc_req); +} +EXPORT_SYMBOL(fld_query); + +/* + * Returns true, if fid is local to this server node. + * + * WARNING: this function is *not* guaranteed to return false if fid is + * remote: it makes an educated conservative guess only. + * + * fid_is_local() is supposed to be used in assertion checks only. + */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid) +{ + int result; + struct seq_server_site *ss_site; + struct lu_seq_range *range; + struct fld_thread_info *info; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + + result = 1; /* conservatively assume fid is local */ + ss_site = lu_site2seq(site); + if (ss_site->ss_client_fld != NULL) { + int rc; + + rc = fld_cache_lookup(ss_site->ss_client_fld->lcf_cache, + fid_seq(fid), range); + if (rc == 0) + result = (range->lsr_index == ss_site->ss_node_id); + } + return result; +} +EXPORT_SYMBOL(fid_is_local); + +static void fld_server_proc_fini(struct lu_server_fld *fld); + +#ifdef LPROCFS +static int fld_server_proc_init(struct lu_server_fld *fld) +{ + int rc = 0; + ENTRY; + + fld->lsf_proc_dir = lprocfs_register(fld->lsf_name, + fld_type_proc_dir, + fld_server_proc_list, fld); + if (IS_ERR(fld->lsf_proc_dir)) { + rc = PTR_ERR(fld->lsf_proc_dir); + RETURN(rc); + } + + rc = lprocfs_seq_create(fld->lsf_proc_dir, "fldb", 0444, + &fld_proc_seq_fops, fld); + if (rc) { + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } + + RETURN(rc); +} + +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_proc_dir != NULL) { + if (!IS_ERR(fld->lsf_proc_dir)) + lprocfs_remove(&fld->lsf_proc_dir); + fld->lsf_proc_dir = NULL; + } + EXIT; +} +#else +static int fld_server_proc_init(struct lu_server_fld *fld) +{ + return 0; +} + +static void fld_server_proc_fini(struct lu_server_fld *fld) +{ + return; +} +#endif + +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int mds_node_id, + int type) +{ + int cache_size, cache_threshold; + int rc; + ENTRY; + + snprintf(fld->lsf_name, sizeof(fld->lsf_name), + "srv-%s", prefix); + + cache_size = FLD_SERVER_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_SERVER_CACHE_THRESHOLD / 100; + + mutex_init(&fld->lsf_lock); + fld->lsf_cache = fld_cache_init(fld->lsf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lsf_cache)) { + rc = PTR_ERR(fld->lsf_cache); + fld->lsf_cache = NULL; + GOTO(out, rc); + } + + if (!mds_node_id && type == LU_SEQ_RANGE_MDT) { + rc = fld_index_init(env, fld, dt); + if (rc) + GOTO(out, rc); + } else { + fld->lsf_obj = NULL; + } + + rc = fld_server_proc_init(fld); + if (rc) + GOTO(out, rc); + + fld->lsf_control_exp = NULL; + + GOTO(out, rc); + +out: + if (rc) + fld_server_fini(env, fld); + return rc; +} +EXPORT_SYMBOL(fld_server_init); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + + fld_server_proc_fini(fld); + fld_index_fini(env, fld); + + if (fld->lsf_cache != NULL) { + if (!IS_ERR(fld->lsf_cache)) + fld_cache_fini(fld->lsf_cache); + fld->lsf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_server_fini); + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre FLD"); +MODULE_LICENSE("GPL"); + +cfs_module(mdd, "0.1.0", fld_mod_init, fld_mod_exit); diff --git a/drivers/staging/lustre/lustre/fld/fld_index.c b/drivers/staging/lustre/lustre/fld/fld_index.c new file mode 100644 index 000000000000..ec68a54c23bd --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/fld_index.c @@ -0,0 +1,426 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_index.c + * + * Author: WangDi + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +# include +# include +# include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +const char fld_index_name[] = "fld"; + +static const struct lu_seq_range IGIF_FLD_RANGE = { + .lsr_start = FID_SEQ_IGIF, + .lsr_end = FID_SEQ_IGIF_MAX + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range DOT_LUSTRE_FLD_RANGE = { + .lsr_start = FID_SEQ_DOT_LUSTRE, + .lsr_end = FID_SEQ_DOT_LUSTRE + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +static const struct lu_seq_range ROOT_FLD_RANGE = { + .lsr_start = FID_SEQ_ROOT, + .lsr_end = FID_SEQ_ROOT + 1, + .lsr_index = 0, + .lsr_flags = LU_SEQ_RANGE_MDT +}; + +const struct dt_index_features fld_index_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(seqno_t), + .dif_keysize_max = sizeof(seqno_t), + .dif_recsize_min = sizeof(struct lu_seq_range), + .dif_recsize_max = sizeof(struct lu_seq_range), + .dif_ptrsize = 4 +}; + +extern struct lu_context_key fld_thread_key; + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new_range, + struct thandle *th) +{ + struct lu_seq_range *tmp; + struct lu_seq_range *range; + struct fld_thread_info *info; + int rc = 0; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; + tmp = &info->fti_irange; + memset(range, 0, sizeof(*range)); + + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc == 0) { + /* In case of duplicate entry, the location must be same */ + LASSERT((range_compare_loc(new_range, range) == 0)); + GOTO(out, rc = -EEXIST); + } + + if (rc != -ENOENT) { + CERROR("%s: lookup range "DRANGE" error: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + + /* Check for merge case, since the fld entry can only be increamental, + * so we will only check whether it can be merged from the left. */ + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_declare_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th); + if (rc) { + CERROR("%s: declare record "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + GOTO(out, rc); + } + memcpy(tmp, new_range, sizeof(*new_range)); + tmp->lsr_start = range->lsr_start; + } else { + memcpy(tmp, new_range, sizeof(*new_range)); + } + + range_cpu_to_be(tmp, tmp); + rc = dt_declare_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th); +out: + RETURN(rc); +} + +/** + * insert range in fld store. + * + * \param range range to be inserted + * \param th transaction for this operation as it could compound + * transaction. + * + * \retval 0 success + * \retval -ve error + * + * The whole fld index insertion is protected by seq->lss_mutex (see + * seq_server_alloc_super), i.e. only one thread will access fldb each + * time, so we do not need worry the fld file and cache will being + * changed between declare and create. + * Because the fld entry can only be increamental, so we will only check + * whether it can be merged from the left. + **/ +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new_range, struct thandle *th) +{ + struct lu_seq_range *range; + struct lu_seq_range *tmp; + struct fld_thread_info *info; + int rc = 0; + int deleted = 0; + struct fld_cache_entry *flde; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + + LASSERT(mutex_is_locked(&fld->lsf_lock)); + + range = &info->fti_lrange; + memset(range, 0, sizeof(*range)); + tmp = &info->fti_irange; + rc = fld_index_lookup(env, fld, new_range->lsr_start, range); + if (rc != -ENOENT) { + rc = rc == 0 ? -EEXIST : rc; + GOTO(out, rc); + } + + if (new_range->lsr_start == range->lsr_end && range->lsr_end != 0 && + range_compare_loc(new_range, range) == 0) { + range_cpu_to_be(tmp, range); + rc = dt_delete(env, fld->lsf_obj, + (struct dt_key *)&tmp->lsr_start, th, + BYPASS_CAPA); + if (rc != 0) + GOTO(out, rc); + memcpy(tmp, new_range, sizeof(*new_range)); + tmp->lsr_start = range->lsr_start; + deleted = 1; + } else { + memcpy(tmp, new_range, sizeof(*new_range)); + } + + range_cpu_to_be(tmp, tmp); + rc = dt_insert(env, fld->lsf_obj, (struct dt_rec *)tmp, + (struct dt_key *)&tmp->lsr_start, th, BYPASS_CAPA, 1); + if (rc != 0) { + CERROR("%s: insert range "DRANGE" failed: rc = %d\n", + fld->lsf_name, PRANGE(new_range), rc); + GOTO(out, rc); + } + + flde = fld_cache_entry_create(new_range); + if (IS_ERR(flde)) + GOTO(out, rc = PTR_ERR(flde)); + + write_lock(&fld->lsf_cache->fci_lock); + if (deleted) + fld_cache_delete_nolock(fld->lsf_cache, new_range); + rc = fld_cache_insert_nolock(fld->lsf_cache, flde); + write_unlock(&fld->lsf_cache->fci_lock); + if (rc) + OBD_FREE_PTR(flde); +out: + RETURN(rc); +} + +/** + * lookup range for a seq passed. note here we only care about the start/end, + * caller should handle the attached location data (flags, index). + * + * \param seq seq for lookup. + * \param range result of lookup. + * + * \retval 0 found, \a range is the matched range; + * \retval -ENOENT not found, \a range is the left-side range; + * \retval -ve other error; + */ +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + seqno_t seq, struct lu_seq_range *range) +{ + struct lu_seq_range *fld_rec; + struct fld_thread_info *info; + int rc; + + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + fld_rec = &info->fti_rec; + + rc = fld_cache_lookup(fld->lsf_cache, seq, fld_rec); + if (rc == 0) { + *range = *fld_rec; + if (range_within(range, seq)) + rc = 0; + else + rc = -ENOENT; + } + + CDEBUG(D_INFO, "%s: lookup seq = "LPX64" range : "DRANGE" rc = %d\n", + fld->lsf_name, seq, PRANGE(range), rc); + + RETURN(rc); +} + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range) +{ + struct thandle *th; + int rc; + ENTRY; + + th = dt_trans_create(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev)); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_declare_index_create(env, fld, range, th); + if (rc != 0) { + if (rc == -EEXIST) + rc = 0; + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev), + th); + if (rc) + GOTO(out, rc); + + rc = fld_index_create(env, fld, range, th); + if (rc == -EEXIST) + rc = 0; +out: + dt_trans_stop(env, lu2dt_dev(fld->lsf_obj->do_lu.lo_dev), th); + RETURN(rc); +} +EXPORT_SYMBOL(fld_insert_entry); + +static int fld_insert_special_entries(const struct lu_env *env, + struct lu_server_fld *fld) +{ + int rc; + + rc = fld_insert_entry(env, fld, &IGIF_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &DOT_LUSTRE_FLD_RANGE); + if (rc != 0) + RETURN(rc); + + rc = fld_insert_entry(env, fld, &ROOT_FLD_RANGE); + + RETURN(rc); +} + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt) +{ + struct dt_object *dt_obj = NULL; + struct lu_fid fid; + struct lu_attr *attr = NULL; + struct lu_seq_range *range = NULL; + struct fld_thread_info *info; + struct dt_object_format dof; + struct dt_it *it; + const struct dt_it_ops *iops; + int rc; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + LASSERT(info != NULL); + + lu_local_obj_fid(&fid, FLD_INDEX_OID); + OBD_ALLOC_PTR(attr); + if (attr == NULL) + RETURN(-ENOMEM); + + memset(attr, 0, sizeof(*attr)); + attr->la_valid = LA_MODE; + attr->la_mode = S_IFREG | 0666; + dof.dof_type = DFT_INDEX; + dof.u.dof_idx.di_feat = &fld_index_features; + + dt_obj = dt_find_or_create(env, dt, &fid, &dof, attr); + if (IS_ERR(dt_obj)) { + rc = PTR_ERR(dt_obj); + CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, + fld_index_name, rc); + dt_obj = NULL; + GOTO(out, rc); + } + + fld->lsf_obj = dt_obj; + rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); + if (rc != 0) { + CERROR("%s: File \"%s\" is not an index: rc = %d!\n", + fld->lsf_name, fld_index_name, rc); + GOTO(out, rc); + } + + range = &info->fti_rec; + /* Load fld entry to cache */ + iops = &dt_obj->do_index_ops->dio_it; + it = iops->init(env, dt_obj, 0, NULL); + if (IS_ERR(it)) + GOTO(out, rc = PTR_ERR(it)); + + rc = iops->load(env, it, 0); + if (rc < 0) + GOTO(out_it_fini, rc); + + if (rc > 0) { + /* Load FLD entry into server cache */ + do { + rc = iops->rec(env, it, (struct dt_rec *)range, 0); + if (rc != 0) + GOTO(out_it_put, rc); + LASSERT(range != NULL); + range_be_to_cpu(range, range); + rc = fld_cache_insert(fld->lsf_cache, range); + if (rc != 0) + GOTO(out_it_put, rc); + rc = iops->next(env, it); + } while (rc == 0); + } + + /* Note: fld_insert_entry will detect whether these + * special entries already exist inside FLDB */ + mutex_lock(&fld->lsf_lock); + rc = fld_insert_special_entries(env, fld); + mutex_unlock(&fld->lsf_lock); + if (rc != 0) { + CERROR("%s: insert special entries failed!: rc = %d\n", + fld->lsf_name, rc); + GOTO(out_it_put, rc); + } + +out_it_put: + iops->put(env, it); +out_it_fini: + iops->fini(env, it); +out: + if (attr != NULL) + OBD_FREE_PTR(attr); + + if (rc != 0) { + if (dt_obj != NULL) + lu_object_put(env, &dt_obj->do_lu); + fld->lsf_obj = NULL; + } + RETURN(rc); +} + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld) +{ + ENTRY; + if (fld->lsf_obj != NULL) { + if (!IS_ERR(fld->lsf_obj)) + lu_object_put(env, &fld->lsf_obj->do_lu); + fld->lsf_obj = NULL; + } + EXIT; +} diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h new file mode 100644 index 000000000000..9fa9e01cdb67 --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/fld_internal.h @@ -0,0 +1,223 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_internal.h + * + * Author: Yury Umanets + * Author: Tom WangDi + */ +#ifndef __FLD_INTERNAL_H +#define __FLD_INTERNAL_H + +#include +#include + +#include +#include +#include + +enum { + LUSTRE_FLD_INIT = 1 << 0, + LUSTRE_FLD_RUN = 1 << 1 +}; + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; + __u64 fst_inflight; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + rwlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Prefered number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by \a fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[80]; + unsigned int fci_no_shrink:1; +}; + +enum fld_op { + FLD_CREATE = 0, + FLD_DELETE = 1, + FLD_LOOKUP = 2 +}; + +enum { + /* 4M of FLD cache will not hurt client a lot. */ + FLD_SERVER_CACHE_SIZE = (4 * 0x100000), + + /* 1M of FLD cache will not hurt client a lot. */ + FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) +}; + +enum { + /* Cache threshold is 10 percent of size. */ + FLD_SERVER_CACHE_THRESHOLD = 10, + + /* Cache threshold is 10 percent of size. */ + FLD_CLIENT_CACHE_THRESHOLD = 10 +}; + +extern struct lu_fld_hash fld_hash[]; + + +struct fld_thread_info { + struct req_capsule *fti_pill; + __u64 fti_key; + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; +}; + +extern struct lu_context_key fld_thread_key; + +int fld_index_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt); + +void fld_index_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_index_create(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *new, + struct thandle *th); + +int fld_index_create(const struct lu_env *env, struct lu_server_fld *fld, + const struct lu_seq_range *new, struct thandle *th); + +int fld_index_lookup(const struct lu_env *env, struct lu_server_fld *fld, + seqno_t seq, struct lu_seq_range *range); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op); + +#ifdef LPROCFS +extern struct lprocfs_vars fld_server_proc_list[]; +extern struct lprocfs_vars fld_client_proc_list[]; +#endif + + +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +int fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +struct fld_cache_entry +*fld_cache_entry_create(const struct lu_seq_range *range); + +int fld_cache_insert_nolock(struct fld_cache *cache, + struct fld_cache_entry *f_new); +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); +void fld_cache_delete_nolock(struct fld_cache *cache, + const struct lu_seq_range *range); +int fld_cache_lookup(struct fld_cache *cache, + const seqno_t seq, struct lu_seq_range *range); + +struct fld_cache_entry* +fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range); +void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); +void fld_dump_cache_entries(struct fld_cache *cache); + +struct fld_cache_entry +*fld_cache_entry_lookup_nolock(struct fld_cache *cache, + struct lu_seq_range *range); +int fld_write_range(const struct lu_env *env, struct dt_object *dt, + const struct lu_seq_range *range, struct thandle *th); + +static inline const char * +fld_target_name(struct lu_fld_target *tar) +{ + if (tar->ft_srv != NULL) + return tar->ft_srv->lsf_name; + + return (const char *)tar->ft_exp->exp_obd->obd_name; +} + +extern proc_dir_entry_t *fld_type_proc_dir; +extern struct file_operations fld_proc_seq_fops; +#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c new file mode 100644 index 000000000000..e9f07398b68a --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/fld_request.c @@ -0,0 +1,519 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/fld_request.c + * + * FLD (Fids Location Database) + * + * Author: Yury Umanets + */ + +#define DEBUG_SUBSYSTEM S_FLD + +# include +# include +# include +# include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "fld_internal.h" + +/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c + * It should be common thing. The same about mdc RPC lock */ +static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + ENTRY; + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +}; + +static void fld_enter_request(struct client_obd *cli) +{ + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = { 0 }; + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + l_wait_event(mcw.mcw_waitq, fld_req_avail(cli, &mcw), &lwi); + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } +} + +static void fld_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static int fld_rrb_hash(struct lu_client_fld *fld, + seqno_t seq) +{ + LASSERT(fld->lcf_count > 0); + return do_div(seq, fld->lcf_count); +} + +static struct lu_fld_target * +fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq) +{ + struct lu_fld_target *target; + int hash; + ENTRY; + + /* Because almost all of special sequence located in MDT0, + * it should go to index 0 directly, instead of calculating + * hash again, and also if other MDTs is not being connected, + * the fld lookup requests(for seq on MDT0) should not be + * blocked because of other MDTs */ + if (fid_seq_is_norm(seq)) + hash = fld_rrb_hash(fld, seq); + else + hash = 0; + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + if (target->ft_idx == hash) + RETURN(target); + } + + CERROR("%s: Can't find target by hash %d (seq "LPX64"). " + "Targets (%d):\n", fld->lcf_name, hash, seq, + fld->lcf_count); + + list_for_each_entry(target, &fld->lcf_targets, ft_chain) { + const char *srv_name = target->ft_srv != NULL ? + target->ft_srv->lsf_name : ""; + const char *exp_name = target->ft_exp != NULL ? + (char *)target->ft_exp->exp_obd->obd_uuid.uuid : + ""; + + CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: "LPU64"\n", + target->ft_exp, exp_name, target->ft_srv, + srv_name, target->ft_idx); + } + + /* + * If target is not found, there is logical error anyway, so here is + * LBUG() to catch this situation. + */ + LBUG(); + RETURN(NULL); +} + +struct lu_fld_hash fld_hash[] = { + { + .fh_name = "RRB", + .fh_hash_func = fld_rrb_hash, + .fh_scan_func = fld_rrb_scan + }, + { + 0, + } +}; + +static struct lu_fld_target * +fld_client_get_target(struct lu_client_fld *fld, seqno_t seq) +{ + struct lu_fld_target *target; + ENTRY; + + LASSERT(fld->lcf_hash != NULL); + + spin_lock(&fld->lcf_lock); + target = fld->lcf_hash->fh_scan_func(fld, seq); + spin_unlock(&fld->lcf_lock); + + if (target != NULL) { + CDEBUG(D_INFO, "%s: Found target (idx "LPU64 + ") by seq "LPX64"\n", fld->lcf_name, + target->ft_idx, seq); + } + + RETURN(target); +} + +/* + * Add export to FLD. This is usually done by CMM and LMV as they are main users + * of FLD module. + */ +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar) +{ + const char *name; + struct lu_fld_target *target, *tmp; + ENTRY; + + LASSERT(tar != NULL); + name = fld_target_name(tar); + LASSERT(name != NULL); + LASSERT(tar->ft_srv != NULL || tar->ft_exp != NULL); + + if (fld->lcf_flags != LUSTRE_FLD_INIT) { + CERROR("%s: Attempt to add target %s (idx "LPU64") " + "on fly - skip it\n", fld->lcf_name, name, + tar->ft_idx); + RETURN(0); + } else { + CDEBUG(D_INFO, "%s: Adding target %s (idx " + LPU64")\n", fld->lcf_name, name, tar->ft_idx); + } + + OBD_ALLOC_PTR(target); + if (target == NULL) + RETURN(-ENOMEM); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { + if (tmp->ft_idx == tar->ft_idx) { + spin_unlock(&fld->lcf_lock); + OBD_FREE_PTR(target); + CERROR("Target %s exists in FLD and known as %s:#"LPU64"\n", + name, fld_target_name(tmp), tmp->ft_idx); + RETURN(-EEXIST); + } + } + + target->ft_exp = tar->ft_exp; + if (target->ft_exp != NULL) + class_export_get(target->ft_exp); + target->ft_srv = tar->ft_srv; + target->ft_idx = tar->ft_idx; + + list_add_tail(&target->ft_chain, + &fld->lcf_targets); + + fld->lcf_count++; + spin_unlock(&fld->lcf_lock); + + RETURN(0); +} +EXPORT_SYMBOL(fld_client_add_target); + +/* Remove export from FLD */ +int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) +{ + struct lu_fld_target *target, *tmp; + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + if (target->ft_idx == idx) { + fld->lcf_count--; + list_del(&target->ft_chain); + spin_unlock(&fld->lcf_lock); + + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + + OBD_FREE_PTR(target); + RETURN(0); + } + } + spin_unlock(&fld->lcf_lock); + RETURN(-ENOENT); +} +EXPORT_SYMBOL(fld_client_del_target); + +#ifdef LPROCFS +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + int rc; + ENTRY; + + fld->lcf_proc_dir = lprocfs_register(fld->lcf_name, + fld_type_proc_dir, + NULL, NULL); + + if (IS_ERR(fld->lcf_proc_dir)) { + CERROR("%s: LProcFS failed in fld-init\n", + fld->lcf_name); + rc = PTR_ERR(fld->lcf_proc_dir); + RETURN(rc); + } + + rc = lprocfs_add_vars(fld->lcf_proc_dir, + fld_client_proc_list, fld); + if (rc) { + CERROR("%s: Can't init FLD proc, rc %d\n", + fld->lcf_name, rc); + GOTO(out_cleanup, rc); + } + + RETURN(0); + +out_cleanup: + fld_client_proc_fini(fld); + return rc; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + ENTRY; + if (fld->lcf_proc_dir) { + if (!IS_ERR(fld->lcf_proc_dir)) + lprocfs_remove(&fld->lcf_proc_dir); + fld->lcf_proc_dir = NULL; + } + EXIT; +} +#else +static int fld_client_proc_init(struct lu_client_fld *fld) +{ + return 0; +} + +void fld_client_proc_fini(struct lu_client_fld *fld) +{ + return; +} +#endif + +EXPORT_SYMBOL(fld_client_proc_fini); + +static inline int hash_is_sane(int hash) +{ + return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); +} + +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash) +{ + int cache_size, cache_threshold; + int rc; + ENTRY; + + LASSERT(fld != NULL); + + snprintf(fld->lcf_name, sizeof(fld->lcf_name), + "cli-%s", prefix); + + if (!hash_is_sane(hash)) { + CERROR("%s: Wrong hash function %#x\n", + fld->lcf_name, hash); + RETURN(-EINVAL); + } + + fld->lcf_count = 0; + spin_lock_init(&fld->lcf_lock); + fld->lcf_hash = &fld_hash[hash]; + fld->lcf_flags = LUSTRE_FLD_INIT; + INIT_LIST_HEAD(&fld->lcf_targets); + + cache_size = FLD_CLIENT_CACHE_SIZE / + sizeof(struct fld_cache_entry); + + cache_threshold = cache_size * + FLD_CLIENT_CACHE_THRESHOLD / 100; + + fld->lcf_cache = fld_cache_init(fld->lcf_name, + cache_size, cache_threshold); + if (IS_ERR(fld->lcf_cache)) { + rc = PTR_ERR(fld->lcf_cache); + fld->lcf_cache = NULL; + GOTO(out, rc); + } + + rc = fld_client_proc_init(fld); + if (rc) + GOTO(out, rc); + EXIT; +out: + if (rc) + fld_client_fini(fld); + else + CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", + fld->lcf_name, fld->lcf_hash->fh_name); + return rc; +} +EXPORT_SYMBOL(fld_client_init); + +void fld_client_fini(struct lu_client_fld *fld) +{ + struct lu_fld_target *target, *tmp; + ENTRY; + + spin_lock(&fld->lcf_lock); + list_for_each_entry_safe(target, tmp, + &fld->lcf_targets, ft_chain) { + fld->lcf_count--; + list_del(&target->ft_chain); + if (target->ft_exp != NULL) + class_export_put(target->ft_exp); + OBD_FREE_PTR(target); + } + spin_unlock(&fld->lcf_lock); + + if (fld->lcf_cache != NULL) { + if (!IS_ERR(fld->lcf_cache)) + fld_cache_fini(fld->lcf_cache); + fld->lcf_cache = NULL; + } + + EXIT; +} +EXPORT_SYMBOL(fld_client_fini); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op) +{ + struct ptlrpc_request *req; + struct lu_seq_range *prange; + __u32 *op; + int rc; + struct obd_import *imp; + ENTRY; + + LASSERT(exp != NULL); + + imp = class_exp2cliimp(exp); + req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, LUSTRE_MDS_VERSION, + FLD_QUERY); + if (req == NULL) + RETURN(-ENOMEM); + + op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); + *op = fld_op; + + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; + + ptlrpc_request_set_replen(req); + req->rq_request_portal = FLD_REQUEST_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (fld_op == FLD_LOOKUP && + imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) + req->rq_allow_replay = 1; + + if (fld_op != FLD_LOOKUP) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + fld_enter_request(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + fld_exit_request(&exp->exp_obd->u.cli); + if (fld_op != FLD_LOOKUP) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) + GOTO(out_req, rc); + + prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); + if (prange == NULL) + GOTO(out_req, rc = -EFAULT); + *range = *prange; + EXIT; +out_req: + ptlrpc_req_finished(req); + return rc; +} + +int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds, + __u32 flags, const struct lu_env *env) +{ + struct lu_seq_range res = { 0 }; + struct lu_fld_target *target; + int rc; + ENTRY; + + fld->lcf_flags |= LUSTRE_FLD_RUN; + + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_index; + RETURN(0); + } + + /* Can not find it in the cache */ + target = fld_client_get_target(fld, seq); + LASSERT(target != NULL); + + CDEBUG(D_INFO, "%s: Lookup fld entry (seq: "LPX64") on " + "target %s (idx "LPU64")\n", fld->lcf_name, seq, + fld_target_name(target), target->ft_idx); + + res.lsr_start = seq; + fld_range_set_type(&res, flags); + if (target->ft_srv != NULL) { + LASSERT(env != NULL); + rc = fld_server_lookup(env, target->ft_srv, seq, &res); + } else { + rc = fld_client_rpc(target->ft_exp, &res, FLD_LOOKUP); + } + + if (rc == 0) { + *mds = res.lsr_index; + + fld_cache_insert(fld->lcf_cache, &res); + } + RETURN(rc); +} +EXPORT_SYMBOL(fld_client_lookup); + +void fld_client_flush(struct lu_client_fld *fld) +{ + fld_cache_flush(fld->lcf_cache); +} +EXPORT_SYMBOL(fld_client_flush); diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c new file mode 100644 index 000000000000..00fe31e3861c --- /dev/null +++ b/drivers/staging/lustre/lustre/fld/lproc_fld.c @@ -0,0 +1,365 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/fld/lproc_fld.c + * + * FLD (FIDs Location Database) + * + * Author: Yury Umanets + * Di Wang + */ + +#define DEBUG_SUBSYSTEM S_FLD + +# include +# include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "fld_internal.h" + +#ifdef LPROCFS +static int +fld_proc_read_targets(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)data; + struct lu_fld_target *target; + int total = 0, rc; + ENTRY; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + list_for_each_entry(target, + &fld->lcf_targets, ft_chain) + { + rc = snprintf(page, count, "%s\n", + fld_target_name(target)); + page += rc; + count -= rc; + total += rc; + if (count == 0) + break; + } + spin_unlock(&fld->lcf_lock); + RETURN(total); +} + +static int +fld_proc_read_hash(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)data; + int rc; + ENTRY; + + LASSERT(fld != NULL); + + spin_lock(&fld->lcf_lock); + rc = snprintf(page, count, "%s\n", fld->lcf_hash->fh_name); + spin_unlock(&fld->lcf_lock); + + RETURN(rc); +} + +static int +fld_proc_write_hash(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)data; + struct lu_fld_hash *hash = NULL; + int i; + ENTRY; + + LASSERT(fld != NULL); + + for (i = 0; fld_hash[i].fh_name != NULL; i++) { + if (count != strlen(fld_hash[i].fh_name)) + continue; + + if (!strncmp(fld_hash[i].fh_name, buffer, count)) { + hash = &fld_hash[i]; + break; + } + } + + if (hash != NULL) { + spin_lock(&fld->lcf_lock); + fld->lcf_hash = hash; + spin_unlock(&fld->lcf_lock); + + CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", + fld->lcf_name, hash->fh_name); + } + + RETURN(count); +} + +static int +fld_proc_write_cache_flush(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lu_client_fld *fld = (struct lu_client_fld *)data; + ENTRY; + + LASSERT(fld != NULL); + + fld_cache_flush(fld->lcf_cache); + + CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); + + RETURN(count); +} + +struct fld_seq_param { + struct lu_env fsp_env; + struct dt_it *fsp_it; + struct lu_server_fld *fsp_fld; + unsigned int fsp_stop:1; +}; + +static void *fldb_seq_start(struct seq_file *p, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->load(¶m->fsp_env, param->fsp_it, *pos); + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static void fldb_seq_stop(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + const struct dt_it_ops *iops; + struct lu_server_fld *fld; + struct dt_object *obj; + + if (param == NULL) + return; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + iops->put(¶m->fsp_env, param->fsp_it); +} + +static void *fldb_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + int rc; + + if (param == NULL || param->fsp_stop) + return NULL; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + rc = iops->next(¶m->fsp_env, param->fsp_it); + if (rc > 0) { + param->fsp_stop = 1; + return NULL; + } + + *pos = be64_to_cpu(*(__u64 *)iops->key(¶m->fsp_env, param->fsp_it)); + return param; +} + +static int fldb_seq_show(struct seq_file *p, void *v) +{ + struct fld_seq_param *param = p->private; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_thread_info *info; + struct lu_seq_range *fld_rec; + int rc; + + if (param == NULL || param->fsp_stop) + return 0; + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + info = lu_context_key_get(¶m->fsp_env.le_ctx, + &fld_thread_key); + fld_rec = &info->fti_rec; + rc = iops->rec(¶m->fsp_env, param->fsp_it, + (struct dt_rec *)fld_rec, 0); + if (rc != 0) { + CERROR("%s:read record error: rc %d\n", + fld->lsf_name, rc); + } else if (fld_rec->lsr_start != 0) { + range_be_to_cpu(fld_rec, fld_rec); + rc = seq_printf(p, DRANGE"\n", PRANGE(fld_rec)); + } + + return rc; +} + +struct seq_operations fldb_sops = { + .start = fldb_seq_start, + .stop = fldb_seq_stop, + .next = fldb_seq_next, + .show = fldb_seq_show, +}; + +static int fldb_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + struct lu_server_fld *fld = (struct lu_server_fld *)dp->data; + struct dt_object *obj; + const struct dt_it_ops *iops; + struct fld_seq_param *param = NULL; + int env_init = 0; + int rc; + + LPROCFS_ENTRY_AND_CHECK(dp); + rc = seq_open(file, &fldb_sops); + if (rc) + GOTO(out, rc); + + obj = fld->lsf_obj; + if (obj == NULL) { + seq = file->private_data; + seq->private = NULL; + return 0; + } + + OBD_ALLOC_PTR(param); + if (param == NULL) + GOTO(out, rc = -ENOMEM); + + rc = lu_env_init(¶m->fsp_env, LCT_MD_THREAD); + if (rc != 0) + GOTO(out, rc); + + env_init = 1; + iops = &obj->do_index_ops->dio_it; + param->fsp_it = iops->init(¶m->fsp_env, obj, 0, NULL); + if (IS_ERR(param->fsp_it)) + GOTO(out, rc = PTR_ERR(param->fsp_it)); + + param->fsp_fld = fld; + param->fsp_stop = 0; + + seq = file->private_data; + seq->private = param; +out: + if (rc != 0) { + if (env_init == 1) + lu_env_fini(¶m->fsp_env); + if (param != NULL) + OBD_FREE_PTR(param); + LPROCFS_EXIT(); + } + return rc; +} + +static int fldb_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct fld_seq_param *param; + struct lu_server_fld *fld; + struct dt_object *obj; + const struct dt_it_ops *iops; + + param = seq->private; + if (param == NULL) { + lprocfs_seq_release(inode, file); + return 0; + } + + fld = param->fsp_fld; + obj = fld->lsf_obj; + LASSERT(obj != NULL); + iops = &obj->do_index_ops->dio_it; + + LASSERT(iops != NULL); + LASSERT(obj != NULL); + LASSERT(param->fsp_it != NULL); + iops->fini(¶m->fsp_env, param->fsp_it); + lu_env_fini(¶m->fsp_env); + OBD_FREE_PTR(param); + lprocfs_seq_release(inode, file); + + return 0; +} + +struct lprocfs_vars fld_server_proc_list[] = { + { NULL }}; + +struct lprocfs_vars fld_client_proc_list[] = { + { "targets", fld_proc_read_targets, NULL, NULL }, + { "hash", fld_proc_read_hash, fld_proc_write_hash, NULL }, + { "cache_flush", NULL, fld_proc_write_cache_flush, NULL }, + { NULL }}; + +struct file_operations fld_proc_seq_fops = { + .owner = THIS_MODULE, + .open = fldb_seq_open, + .read = seq_read, + .release = fldb_seq_release, +}; + +#endif diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h new file mode 100644 index 000000000000..61c463553200 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/cl_object.h @@ -0,0 +1,3281 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * - cl_req represents a collection of pages for a transfer. cl_req is + * constructed by req-forming engine that tries to saturate + * transport with large and continuous transfers. + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - cl_object_header::coh_lock_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include +# include +# include + +struct inode; + +struct cl_device; +struct cl_device_operations; + +struct cl_object; +struct cl_object_page_operations; +struct cl_object_lock_operations; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req; +struct cl_req_slice; + +/** + * Operations for each data device in the client stack. + * + * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops + */ +struct cl_device_operations { + /** + * Initialize cl_req. This method is called top-to-bottom on all + * devices in the stack to get them a chance to allocate layer-private + * data, and to attach them to the cl_req by calling + * cl_req_slice_add(). + * + * \see osc_req_init(), lov_req_init(), lovsub_req_init() + * \see ccc_req_init() + */ + int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +}; + +/** + * Device in the client stack. + * + * \see ccc_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; + /** Per-layer operation vector. */ + const struct cl_device_operations *cd_ops; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see ccc_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lustre_md *coc_md; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see ccc_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + /** \name locks + * \todo XXX move locks below to the separate cache-lines, they are + * mostly useless otherwise. + */ + /** @{ */ + /** Lock protecting page tree. */ + spinlock_t coh_page_guard; + /** Lock protecting lock list. */ + spinlock_t coh_lock_guard; + /** @} locks */ + /** Radix tree of cl_page's, cached for this object. */ + struct radix_tree_root coh_tree; + /** # of pages in radix tree. */ + unsigned long coh_pages; + /** List of cl_lock's granted for this object. */ + struct list_head coh_locks; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. + */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** @} cl_object */ + +#ifndef pgoff_t +#define pgoff_t unsigned long +#endif + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as neither references to it can be acquired at that point, nor + * ones exist. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page (which is a typedef for + * struct page) is implemented in the vvp layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). + * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into an cl_req being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page form the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read cl_req from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO, lockless IO and liblustre. */ + CPT_TRANSIENT, +}; + +/** + * Flags maintained for every cl_page. + */ +enum cl_page_flags { + /** + * Set when pagein completes. Used for debugging (read completes at + * most once for a page). + */ + CPF_READ_COMPLETED = 1 << 0 +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. Immutable after creation. */ + struct cl_object *cp_obj; + /** Logical page index within the object. Immutable after creation. */ + pgoff_t cp_index; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** Parent page, NULL for top-level page. Immutable after creation. */ + struct cl_page *cp_parent; + /** Lower-layer page. NULL for bottommost page. Immutable after + * creation. */ + struct cl_page *cp_child; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** Linkage of pages within group. Protected by cl_page::cp_mutex. */ + struct list_head cp_batch; + /** Mutex serializing membership of a page in a batch. */ + struct mutex cp_mutex; + /** Linkage of pages within cl_req. */ + struct list_head cp_flight; + /** Transfer error. */ + int cp_error; + + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** + * Debug information, the task is owning the page. + */ + task_t *cp_task; + /** + * Owning IO request in cl_page_state::CPS_PAGEOUT and + * cl_page_state::CPS_PAGEIN states. This field is maintained only in + * the top-level pages. Protected by a VM lock. + */ + struct cl_req *cp_req; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link *cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link *cp_queue_ref; + /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */ + unsigned cp_flags; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. + * + * \see ccc_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \warning: cl_lock_mode_match() assumes particular ordering here. + * \ingroup cl_lock + */ +enum cl_lock_mode { + /** + * Mode of a lock that protects no data, and exists only as a + * placeholder. This is used for `glimpse' requests. A phantom lock + * might get promoted to real lock at some point. + */ + CLM_PHANTOM, + CLM_READ, + CLM_WRITE, + CLM_GROUP +}; + +/** + * Requested transfer type. + * \ingroup cl_req + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * \return the underlying VM page. Optional. + */ + struct page *(*cpo_vmpage)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. + * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Unmaps page from the user space (if it is mapped). + * + * \see cl_page_unmap() + * \see vvp_page_unmap() + */ + int (*cpo_unmap)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. + * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice); + + /** + * Checks whether the page is protected by a cl_lock. This is a + * per-layer method, because certain layers have ways to check for the + * lock much more efficiently than through the generic locks scan, or + * implement locking mechanisms separate from cl_lock, e.g., + * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks + * being canceled, or scheduled for cancellation as soon as the last + * user goes away, too. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. + * + * \see cl_page_is_under_lock() + */ + int (*cpo_is_under_lock)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. See comment on cl_req for a description of + * transfer formation and life-cycle. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * cl_req as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Announce that this page is to be written out + * opportunistically, that is, page is dirty, it is not + * necessary to start write-out transfer right now, but + * eventually page has to be written out. + * + * Main caller of this is the write path (see + * vvp_io_commit_write()), using this method to build a + * "transfer cache" from which large transfers are then + * constructed by the req-formation engine. + * + * \todo XXX it would make sense to add page-age tracking + * semantics here, and to oblige the req-formation engine to + * send the page out not later than it is too old. + * + * \see cl_page_cache_add() + */ + int (*cpo_cache_add)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline int __page_in_use(const struct cl_page *page, int refc) +{ + if (page->cp_type == CPT_CACHEABLE) + ++refc; + LASSERT(atomic_read(&page->cp_ref) > 0); + return (atomic_read(&page->cp_ref) > refc); +} +#define cl_page_in_use(pg) __page_in_use(pg, 1) +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * All locks for a given object are linked into cl_object_header::coh_locks + * list (protected by cl_object_header::coh_lock_guard spin-lock) through + * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can + * sort it in starting lock offset, or use altogether different data structure + * like a tree. + * + * Typical cl_lock consists of the two layers: + * + * - vvp_lock (vvp specific data), and + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - lovsub_lock, and + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated to), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is reference counted. When reference counter drops to 0, lock is + * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING + * lock is destroyed when last reference is released. Referencing between + * top-lock and its sub-locks is described in the lov documentation module. + * + * STATE MACHINE + * + * Also, cl_lock is a state machine. This requires some clarification. One of + * the goals of client IO re-write was to make IO path non-blocking, or at + * least to make it easier to make it non-blocking in the future. Here + * `non-blocking' means that when a system call (read, write, truncate) + * reaches a situation where it has to wait for a communication with the + * server, it should --instead of waiting-- remember its current state and + * switch to some other work. E.g,. instead of waiting for a lock enqueue, + * client should proceed doing IO on the next stripe, etc. Obviously this is + * rather radical redesign, and it is not planned to be fully implemented at + * this time, instead we are putting some infrastructure in place, that would + * make it easier to do asynchronous non-blocking IO easier in the + * future. Specifically, where old locking code goes to sleep (waiting for + * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When + * enqueue reply comes, its completion handler signals that lock state-machine + * is ready to transit to the next state. There is some generic code in + * cl_lock.c that sleeps, waiting for these signals. As a result, for users of + * this cl_lock.c code, it looks like locking is done in normal blocking + * fashion, and it the same time it is possible to switch to the non-blocking + * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c + * functions). + * + * For a description of state machine states and transitions see enum + * cl_lock_state. + * + * There are two ways to restrict a set of states which lock might move to: + * + * - placing a "hold" on a lock guarantees that lock will not be moved + * into cl_lock_state::CLS_FREEING state until hold is released. Hold + * can be only acquired on a lock that is not in + * cl_lock_state::CLS_FREEING. All holds on a lock are counted in + * cl_lock::cll_holds. Hold protects lock from cancellation and + * destruction. Requests to cancel and destroy a lock on hold will be + * recorded, but only honored when last hold on a lock is released; + * + * - placing a "user" on a lock guarantees that lock will not leave + * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of + * states, once it enters this set. That is, if a user is added onto a + * lock in a state not from this set, it doesn't immediately enforce + * lock to move to this set, but once lock enters this set it will + * remain there until all users are removed. Lock users are counted in + * cl_lock::cll_users. + * + * User is used to assure that lock is not canceled or destroyed while + * it is being enqueued, or actively used by some IO. + * + * Currently, a user always comes with a hold (cl_lock_invariant() + * checks that a number of holds is not less than a number of users). + * + * CONCURRENCY + * + * This is how lock state-machine operates. struct cl_lock contains a mutex + * cl_lock::cll_guard that protects struct fields. + * + * - mutex is taken, and cl_lock::cll_state is examined. + * + * - for every state there are possible target states where lock can move + * into. They are tried in order. Attempts to move into next state are + * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try(). + * + * - if the transition can be performed immediately, state is changed, + * and mutex is released. + * + * - if the transition requires blocking, _try() function returns + * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to + * sleep, waiting for possibility of lock state change. It is woken + * up when some event occurs, that makes lock state change possible + * (e.g., the reception of the reply from the server), and repeats + * the loop. + * + * Top-lock and sub-lock has separate mutexes and the latter has to be taken + * first to avoid dead-lock. + * + * To see an example of interaction of all these issues, take a look at the + * lov_cl.c:lov_lock_enqueue() function. It is called as a part of + * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by + * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note + * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It + * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be + * done in parallel, rather than one after another (this is used for glimpse + * locks, that cannot dead-lock). + * + * INTERFACE AND USAGE + * + * struct cl_lock_operations provide a number of call-backs that are invoked + * when events of interest occurs. Layers can intercept and handle glimpse, + * blocking, cancel ASTs and a reception of the reply from the server. + * + * One important difference with the old client locking model is that new + * client has a representation for the top-lock, whereas in the old code only + * sub-locks existed as real data structures and file-level locks are + * represented by "request sets" that are created and destroyed on each and + * every lock creation. + * + * Top-locks are cached, and can be found in the cache by the system calls. It + * is possible that top-lock is in cache, but some of its sub-locks were + * canceled and destroyed. In that case top-lock has to be enqueued again + * before it can be used. + * + * Overall process of the locking during IO operation is as following: + * + * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released into cache. + * + * Striping introduces major additional complexity into locking. The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divived, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. + * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * As multi-stripe locks have to be allowed, it makes sense to cache them, so + * that, for example, a sequence of O_APPEND writes can proceed quickly + * without going down to the individual stripes to do lock matching. On the + * other hand, multi-stripe locks shouldn't be used by normal read/write + * calls. To achieve this, every layer can implement ->clo_fits_into() method, + * that is called by lock matching code (cl_lock_lookup()), and that can be + * used to selectively disable matching of certain locks for certain IOs. For + * exmaple, lov layer implements lov_lock_fits_into() that allow multi-stripe + * locks to be matched only for truncates and O_APPEND writes. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags. + */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Lock state-machine states. + * + * \htmlonly + *
+ *
+ * Possible state transitions:
+ *
+ *	      +------------------>NEW
+ *	      |		    |
+ *	      |		    | cl_enqueue_try()
+ *	      |		    |
+ *	      |    cl_unuse_try()  V
+ *	      |  +--------------QUEUING (*)
+ *	      |  |		 |
+ *	      |  |		 | cl_enqueue_try()
+ *	      |  |		 |
+ *	      |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |		 |
+ *	      |  |		 | cl_wait_try()
+ *	      |  |		 |
+ *	      |  |		(R)
+ *	      |  |		 |
+ *	      |  |		 V
+ *	      |  |		HELD<---------+
+ *	      |  |		 |	    |
+ *	      |  |		 |	    | cl_use_try()
+ *	      |  |  cl_unuse_try() |	    |
+ *	      |  |		 |	    |
+ *	      |  |		 V	 ---+
+ *	      |  +------------>INTRANSIT (D) <--+
+ *	      |		    |	    |
+ *	      |     cl_unuse_try() |	    | cached lock found
+ *	      |		    |	    | cl_use_try()
+ *	      |		    |	    |
+ *	      |		    V	    |
+ *	      +------------------CACHED---------+
+ *				   |
+ *				  (C)
+ *				   |
+ *				   V
+ *				FREEING
+ *
+ * Legend:
+ *
+ *	 In states marked with (*) transition to the same state (i.e., a loop
+ *	 in the diagram) is possible.
+ *
+ *	 (R) is the point where Receive call-back is invoked: it allows layers
+ *	 to handle arrival of lock reply.
+ *
+ *	 (C) is the point where Cancellation call-back is invoked.
+ *
+ *	 (D) is the transit state which means the lock is changing.
+ *
+ *	 Transition to FREEING state is possible from any other state in the
+ *	 diagram in case of unrecoverable error.
+ * 
+ * \endhtmlonly + * + * These states are for individual cl_lock object. Top-lock and its sub-locks + * can be in the different states. Another way to say this is that we have + * nested state-machines. + * + * Separate QUEUING and ENQUEUED states are needed to support non-blocking + * operation for locks with multiple sub-locks. Imagine lock on a file F, that + * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send + * enqueue to S0, wait for its completion, then send enqueue for S1, wait for + * its completion and at last enqueue lock for S2, and wait for its + * completion. In that case, top-lock is in QUEUING state while S0, S1 are + * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note + * that in this case, sub-locks move from state to state, and top-lock remains + * in the same state). + */ +enum cl_lock_state { + /** + * Lock that wasn't yet enqueued + */ + CLS_NEW, + /** + * Enqueue is in progress, blocking for some intermediate interaction + * with the other side. + */ + CLS_QUEUING, + /** + * Lock is fully enqueued, waiting for server to reply when it is + * granted. + */ + CLS_ENQUEUED, + /** + * Lock granted, actively used by some IO. + */ + CLS_HELD, + /** + * This state is used to mark the lock is being used, or unused. + * We need this state because the lock may have several sublocks, + * so it's impossible to have an atomic way to bring all sublocks + * into CLS_HELD state at use case, or all sublocks to CLS_CACHED + * at unuse case. + * If a thread is referring to a lock, and it sees the lock is in this + * state, it must wait for the lock. + * See state diagram for details. + */ + CLS_INTRANSIT, + /** + * Lock granted, not used. + */ + CLS_CACHED, + /** + * Lock is being destroyed. + */ + CLS_FREEING, + CLS_NR +}; + +enum cl_lock_flags { + /** + * lock has been cancelled. This flag is never cleared once set (by + * cl_lock_cancel0()). + */ + CLF_CANCELLED = 1 << 0, + /** cancellation is pending for this lock. */ + CLF_CANCELPEND = 1 << 1, + /** destruction is pending for this lock. */ + CLF_DOOMED = 1 << 2, + /** from enqueue RPC reply upcall. */ + CLF_FROM_UPCALL= 1 << 3, +}; + +/** + * Lock closure. + * + * Lock closure is a collection of locks (both top-locks and sub-locks) that + * might be updated in a result of an operation on a certain lock (which lock + * this is a closure of). + * + * Closures are needed to guarantee dead-lock freedom in the presence of + * + * - nested state-machines (top-lock state-machine composed of sub-lock + * state-machines), and + * + * - shared sub-locks. + * + * Specifically, many operations, such as lock enqueue, wait, unlock, + * etc. start from a top-lock, and then operate on a sub-locks of this + * top-lock, holding a top-lock mutex. When sub-lock state changes as a result + * of such operation, this change has to be propagated to all top-locks that + * share this sub-lock. Obviously, no natural lock ordering (e.g., + * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has + * to be used. Lock closure systematizes this try-and-repeat logic. + */ +struct cl_lock_closure { + /** + * Lock that is mutexed when closure construction is started. When + * closure in is `wait' mode (cl_lock_closure::clc_wait), mutex on + * origin is released before waiting. + */ + struct cl_lock *clc_origin; + /** + * List of enclosed locks, so far. Locks are linked here through + * cl_lock::cll_inclosure. + */ + struct list_head clc_list; + /** + * True iff closure is in a `wait' mode. This determines what + * cl_lock_enclosure() does when a lock L to be added to the closure + * is currently mutexed by some other thread. + * + * If cl_lock_closure::clc_wait is not set, then closure construction + * fails with CLO_REPEAT immediately. + * + * In wait mode, cl_lock_enclosure() waits until next attempt to build + * a closure might succeed. To this end it releases an origin mutex + * (cl_lock_closure::clc_origin), that has to be the only lock mutex + * owned by the current thread, and then waits on L mutex (by grabbing + * it and immediately releasing), before returning CLO_REPEAT to the + * caller. + */ + int clc_wait; + /** Number of locks in the closure. */ + int clc_nr; +}; + +/** + * Layered client lock. + */ +struct cl_lock { + /** Reference counter. */ + atomic_t cll_ref; + /** List of slices. Immutable after creation. */ + struct list_head cll_layers; + /** + * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected + * by cl_lock::cll_descr::cld_obj::coh_lock_guard. + */ + struct list_head cll_linkage; + /** + * Parameters of this lock. Protected by + * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within + * cl_lock::cll_guard. Modified only on lock creation and in + * cl_lock_modify(). + */ + struct cl_lock_descr cll_descr; + /** Protected by cl_lock::cll_guard. */ + enum cl_lock_state cll_state; + /** signals state changes. */ + wait_queue_head_t cll_wq; + /** + * Recursive lock, most fields in cl_lock{} are protected by this. + * + * Locking rules: this mutex is never held across network + * communication, except when lock is being canceled. + * + * Lock ordering: a mutex of a sub-lock is taken first, then a mutex + * on a top-lock. Other direction is implemented through a + * try-lock-repeat loop. Mutices of unrelated locks can be taken only + * by try-locking. + * + * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait(). + */ + struct mutex cll_guard; + task_t *cll_guarder; + int cll_depth; + + /** + * the owner for INTRANSIT state + */ + task_t *cll_intransit_owner; + int cll_error; + /** + * Number of holds on a lock. A hold prevents a lock from being + * canceled and destroyed. Protected by cl_lock::cll_guard. + * + * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release() + */ + int cll_holds; + /** + * Number of lock users. Valid in cl_lock_state::CLS_HELD state + * only. Lock user pins lock in CLS_HELD state. Protected by + * cl_lock::cll_guard. + * + * \see cl_wait(), cl_unuse(). + */ + int cll_users; + /** + * Flag bit-mask. Values from enum cl_lock_flags. Updates are + * protected by cl_lock::cll_guard. + */ + unsigned long cll_flags; + /** + * A linkage into a list of locks in a closure. + * + * \see cl_lock_closure + */ + struct list_head cll_inclosure; + /** + * Confict lock at queuing time. + */ + struct cl_lock *cll_conflict; + /** + * A list of references to this lock, for debugging. + */ + struct lu_ref cll_reference; + /** + * A list of holds on this lock, for debugging. + */ + struct lu_ref cll_holders; + /** + * A reference for cl_lock::cll_descr::cld_obj. For debugging. + */ + struct lu_ref_link *cll_obj_ref; +#ifdef CONFIG_LOCKDEP + /* "dep_map" name is assumed by lockdep.h macros. */ + struct lockdep_map dep_map; +#endif +}; + +/** + * Per-layer part of cl_lock + * + * \see ccc_lock, lov_lock, lovsub_lock, osc_lock + */ +struct cl_lock_slice { + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. */ + struct cl_object *cls_obj; + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}(). + * + * NOTE: lov_subresult() depends on ordering here. + */ +enum cl_lock_transition { + /** operation cannot be completed immediately. Wait for state change. */ + CLO_WAIT = 1, + /** operation had to release lock mutex, restart. */ + CLO_REPEAT = 2, + /** lower layer re-enqueued. */ + CLO_REENQUEUED = 3, +}; + +/** + * + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** + * \name statemachine + * + * State machine transitions. These 3 methods are called to transfer + * lock from one state to another, as described in the commentary + * above enum #cl_lock_state. + * + * \retval 0 this layer has nothing more to do to before + * transition to the target state happens; + * + * \retval CLO_REPEAT method had to release and re-acquire cl_lock + * mutex, repeat invocation of transition method + * across all layers; + * + * \retval CLO_WAIT this layer cannot move to the target state + * immediately, as it has to wait for certain event + * (e.g., the communication with the server). It + * is guaranteed, that when the state transfer + * becomes possible, cl_lock::cll_wq wait-queue + * is signaled. Caller can wait for this event by + * calling cl_lock_state_wait(); + * + * \retval -ve failure, abort state transition, move the lock + * into cl_lock_state::CLS_FREEING state, and set + * cl_lock::cll_error. + * + * Once all layers voted to agree to transition (by returning 0), lock + * is moved into corresponding target state. All state transition + * methods are optional. + */ + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); + /** + * Attempts to wait for enqueue result. Called top-to-bottom. + * + * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait() + */ + int (*clo_wait)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Attempts to unlock the lock. Called bottom-to-top. In addition to + * usual return values of lock state-machine methods, this can return + * -ESTALE to indicate that lock cannot be returned to the cache, and + * has to be re-initialized. + * unuse is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse() + */ + int (*clo_unuse)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Notifies layer that cached lock is started being used. + * + * \pre lock->cll_state == CLS_CACHED + * + * \see lov_lock_use(), osc_lock_use() + */ + int (*clo_use)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} statemachine */ + /** + * A method invoked when lock state is changed (as a result of state + * transition). This is used, for example, to track when the state of + * a sub-lock changes, to propagate this change to the corresponding + * top-lock. Optional + * + * \see lovsub_lock_state() + */ + void (*clo_state)(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state st); + /** + * Returns true, iff given lock is suitable for the given io, idea + * being, that there are certain "unsafe" locks, e.g., ones acquired + * for O_APPEND writes, that we don't want to re-use for a normal + * write, to avoid the danger of cascading evictions. Optional. Runs + * under cl_object_header::coh_lock_guard. + * + * XXX this should take more information about lock needed by + * io. Probably lock description or something similar. + * + * \see lov_fits_into() + */ + int (*clo_fits_into)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); + /** + * \name ast + * Asynchronous System Traps. All of then are optional, all are + * executed bottom-to-top. + */ + /** @{ */ + + /** + * Cancellation callback. Cancel a lock voluntarily, or under + * the request of server. + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Lock weighting ast. Executed to estimate how precious this lock + * is. The sum of results across all layers is used to determine + * whether lock worth keeping in cache given present memory usage. + * + * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh(). + */ + unsigned long (*clo_weigh)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} ast */ + + /** + * \see lovsub_lock_closure() + */ + int (*clo_closure)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure); + /** + * Executed bottom-to-top when lock description changes (e.g., as a + * result of server granting more generous lock than was requested). + * + * \see lovsub_lock_modify() + */ + int (*clo_modify)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *updated); + /** + * Notifies layers (bottom-to-top) that lock is going to be + * destroyed. Responsibility of layers is to prevent new references on + * this lock from being acquired once this method returns. + * + * This can be called multiple times due to the races. + * + * \see cl_lock_delete() + * \see osc_lock_delete(), lovsub_lock_delete() + */ + void (*clo_delete)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Destructor. Frees resources and the slice. + * + * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages. + * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + unsigned pl_nr; + struct list_head pl_pages; + task_t *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + struct cl_page_list c2_qin; + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released. + * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * (cl_io_operations::cio_read_page() for read, + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - glimpse. An io context to acquire glimpse lock. + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. + */ + CIT_MISC, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io, ccc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. + * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. + */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + struct { + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + } req_op[CRT_NR]; + /** + * Read missing page. + * + * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start() + * method, when it hits not-up-to-date page in the range. Optional. + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_page)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page); + /** + * Prepare write of a \a page. Called bottom-to-top by a top-level + * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for + * get data from user-level buffer. + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_prepare_write(), lov_io_prepare_write(), + * osc_io_prepare_write(). + */ + int (*cio_prepare_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_commit_write(), lov_io_commit_write(), + * osc_io_commit_write(). + */ + int (*cio_commit_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, + /** + * tell the server to instruct (though a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * for async glimpse lock. + */ + CEF_AGL = 0x00000020, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x0000003f, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock_descr cill_descr; + struct cl_lock *cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks currently being processed. */ + struct list_head cls_curr; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO. It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part of. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_valid; + struct obd_capa *sa_capa; + } ci_setattr; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** bytes valid byte on a faulted page. */ + int ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + struct obd_capa *fi_capa; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO need to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all page to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to openning files, it doesn't need to + * verify layout because HSM won't release openning files. + * Right now, only two opertaions need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */ + +/** \addtogroup cl_req cl_req + * @{ */ +/** \struct cl_req + * Transfer. + * + * There are two possible modes of transfer initiation on the client: + * + * - immediate transfer: this is started when a high level io wants a page + * or a collection of pages to be transferred right away. Examples: + * read-ahead, synchronous read in the case of non-page aligned write, + * page write-out as a part of extent lock cancellation, page write-out + * as a part of memory cleansing. Immediate transfer can be both + * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; + * + * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens + * when io wants to transfer a page to the server some time later, when + * it can be done efficiently. Example: pages dirtied by the write(2) + * path. + * + * In any case, transfer takes place in the form of a cl_req, which is a + * representation for a network RPC. + * + * Pages queued for an opportunistic transfer are cached until it is decided + * that efficient RPC can be composed of them. This decision is made by "a + * req-formation engine", currently implemented as a part of osc + * layer. Req-formation depends on many factors: the size of the resulting + * RPC, whether or not multi-object RPCs are supported by the server, + * max-rpc-in-flight limitations, size of the dirty cache, etc. + * + * For the immediate transfer io submits a cl_page_list, that req-formation + * engine slices into cl_req's, possibly adding cached pages to some of + * the resulting req's. + * + * Whenever a page from cl_page_list is added to a newly constructed req, its + * cl_page_operations::cpo_prep() layer methods are called. At that moment, + * page state is atomically changed from cl_page_state::CPS_OWNED to + * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner + * is zeroed, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_prep() method at the particular layer might + * return -EALREADY to indicate that it does not need to submit this page + * at all. This is possible, for example, if page, submitted for read, + * became up-to-date in the meantime; and for write, the page don't have + * dirty bit marked. \see cl_io_submit_rw() + * + * Whenever a cached page is added to a newly constructed req, its + * cl_page_operations::cpo_make_ready() layer methods are called. At that + * moment, page state is atomically changed from cl_page_state::CPS_CACHED to + * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to + * req. cl_page_operations::cpo_make_ready() method at the particular layer + * might return -EAGAIN to indicate that this page is not eligible for the + * transfer right now. + * + * FUTURE + * + * Plan is to divide transfers into "priority bands" (indicated when + * submitting cl_page_list, and queuing a page for the opportunistic transfer) + * and allow glueing of cached pages to immediate transfers only within single + * band. This would make high priority transfers (like lock cancellation or + * memory pressure induced write-out) really high priority. + * + */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Capability. */ + struct obd_capa *cra_capa; + /** Jobid */ + char cra_jobid[JOBSTATS_JOBID_SIZE]; +}; + +/** + * Transfer request operations definable at every layer. + * + * Concurrency: transfer formation engine synchronizes calls to all transfer + * methods. + */ +struct cl_req_operations { + /** + * Invoked top-to-bottom by cl_req_prep() when transfer formation is + * complete (all pages are added). + * + * \see osc_req_prep() + */ + int (*cro_prep)(const struct lu_env *env, + const struct cl_req_slice *slice); + /** + * Called top-to-bottom to fill in \a oa fields. This is called twice + * with different flags, see bug 10150 and osc_build_req(). + * + * \param obj an object from cl_req which attributes are to be set in + * \a oa. + * + * \param oa struct obdo where attributes are placed + * + * \param flags \a oa fields to be filled. + */ + void (*cro_attr_set)(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags); + /** + * Called top-to-bottom from cl_req_completion() to notify layers that + * transfer completed. Has to free all state allocated by + * cl_device_operations::cdo_req_init(). + */ + void (*cro_completion)(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +}; + +/** + * A per-object state that (potentially multi-object) transfer request keeps. + */ +struct cl_req_obj { + /** object itself */ + struct cl_object *ro_obj; + /** reference to cl_req_obj::ro_obj. For debugging. */ + struct lu_ref_link *ro_obj_ref; + /* something else? Number of pages for a given object? */ +}; + +/** + * Transfer request. + * + * Transfer requests are not reference counted, because IO sub-system owns + * them exclusively and knows when to free them. + * + * Life cycle. + * + * cl_req is created by cl_req_alloc() that calls + * cl_device_operations::cdo_req_init() device methods to allocate per-req + * state in every layer. + * + * Then pages are added (cl_req_page_add()), req keeps track of all objects it + * contains pages for. + * + * Once all pages were collected, cl_page_operations::cpo_prep() method is + * called top-to-bottom. At that point layers can modify req, let it pass, or + * deny it completely. This is to support things like SNS that have transfer + * ordering requirements invisible to the individual req-formation engine. + * + * On transfer completion (or transfer timeout, or failure to initiate the + * transfer of an allocated req), cl_req_operations::cro_completion() method + * is called, after execution of cl_page_operations::cpo_completion() of all + * req's pages. + */ +struct cl_req { + enum cl_req_type crq_type; + /** A list of pages being transfered */ + struct list_head crq_pages; + /** Number of pages in cl_req::crq_pages */ + unsigned crq_nrpages; + /** An array of objects which pages are in ->crq_pages */ + struct cl_req_obj *crq_o; + /** Number of elements in cl_req::crq_objs[] */ + unsigned crq_nrobjs; + struct list_head crq_layers; +}; + +/** + * Per-layer state for request. + */ +struct cl_req_slice { + struct cl_req *crs_req; + struct cl_device *crs_dev; + struct list_head crs_linkage; + const struct cl_req_operations *crs_ops; +}; + +/* @} cl_req */ + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); +int cache_stats_print(const struct cache_stats *cs, + char *page, int count, int header); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + struct cache_stats cs_locks; + atomic_t cs_pages_state[CPS_NR]; + atomic_t cs_locks_state[CLS_NR]; +}; + +int cl_site_init (struct cl_site *s, struct cl_device *top); +void cl_site_fini (struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *s, char *page, int count); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline int lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_device *cl_object_device(const struct cl_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); + return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +void cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_has_locks (struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; + +/* callback of cl_page_gang_lookup() */ +typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *, + struct cl_page *, void *); +int cl_page_gang_lookup (const struct lu_env *env, + struct cl_object *obj, + struct cl_io *io, + pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata); +struct cl_page *cl_page_lookup (struct cl_object_header *hdr, + pgoff_t index); +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_find_sub (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct page *cl_page_vmpage (const struct lu_env *env, + struct cl_page *page); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. + */ +/** @{ */ +void cl_page_discard (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete (const struct lu_env *env, struct cl_page *pg); +int cl_page_unmap (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +int cl_page_is_vmlocked (const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export (const struct lu_env *env, + struct cl_page *pg, int uptodate); +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +loff_t cl_offset (const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index (const struct cl_object *obj, loff_t offset); +int cl_page_size (const struct cl_object *obj); +int cl_pages_prune (const struct lu_env *env, struct cl_object *obj); + +void cl_lock_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ + +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, int pending, + int canceld); +static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, + struct cl_lock *except, + int pending, int canceld) +{ + LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj)); + return cl_lock_at_pgoff(env, obj, page->cp_index, except, + pending, canceld); +} + +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); + +void cl_lock_get (struct cl_lock *lock); +void cl_lock_get_trust (struct cl_lock *lock); +void cl_lock_put (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_release (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock); + +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock); +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_lock_is_intransit(struct cl_lock *lock); + +int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock, + int keep_mutex); + +/** \name statemachine statemachine + * Interface to lock state machine consists of 3 parts: + * + * - "try" functions that attempt to effect a state transition. If state + * transition is not possible right now (e.g., if it has to wait for some + * asynchronous event to occur), these functions return + * cl_lock_transition::CLO_WAIT. + * + * - "non-try" functions that implement synchronous blocking interface on + * top of non-blocking "try" functions. These functions repeatedly call + * corresponding "try" versions, and if state transition is not possible + * immediately, wait for lock state change. + * + * - methods from cl_lock_operations, called by "try" functions. Lock can + * be advanced to the target state only when all layers voted that they + * are ready for this transition. "Try" functions call methods under lock + * mutex. If a layer had to release a mutex, it re-acquires it and returns + * cl_lock_transition::CLO_REPEAT, causing "try" function to call all + * layers again. + * + * TRY NON-TRY METHOD FINAL STATE + * + * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED + * + * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD + * + * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED + * + * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD + * + * @{ */ + +int cl_enqueue (const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_unuse (const struct lu_env *env, struct cl_lock *lock); +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock); +int cl_wait_try (const struct lu_env *env, struct cl_lock *lock); +int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic); + +/** @} statemachine */ + +void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_queue_match (const struct list_head *queue, + const struct cl_lock_descr *need); + +void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_is_mutexed (struct cl_lock *lock); +int cl_lock_nr_mutexed (const struct lu_env *env); +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock); +int cl_lock_ext_match (const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need); +int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc); + +void cl_lock_closure_init (const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait); +void cl_lock_closure_fini (struct cl_lock_closure *closure); +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); +void cl_lock_disclosure (const struct lu_env *env, + struct cl_lock_closure *closure); +int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error); +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait); + +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_read_page (const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_commit_write (const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); +int cl_io_is_going (const struct lu_env *env); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} + +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). + */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + (sizeof *__foo_io) - sizeof __foo_io->base); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. + */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_own (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_unmap (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +/** \defgroup cl_req cl_req + * @{ */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects); + +void cl_req_page_add (const struct lu_env *env, struct cl_req *req, + struct cl_page *page); +void cl_req_page_done (const struct lu_env *env, struct cl_page *page); +int cl_req_prep (const struct lu_env *env, struct cl_req *req); +void cl_req_attr_set (const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags); +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. This is allocated on a stack by thread + * doing synchronous transfer, and a pointer to this structure is set up in + * every page submitted for transfer. Transfer completion routine updates + * anchor and wakes up waiting thread when transfer is complete. + */ +struct cl_sync_io { + /** number of pages yet to be transferred. */ + atomic_t csi_sync_nr; + /** error code. */ + int csi_sync_rc; + /** barrier of destroy this structure */ + atomic_t csi_barrier; + /** completion to be signaled when transfer is complete. */ + wait_queue_head_t csi_waitq; +}; + +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages); +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret); + +/** @} cl_sync_io */ + +/** @} cl_req */ + +/** \defgroup cl_env cl_env + * + * lu_env handling for a client. + * + * lu_env is an environment within which lustre code executes. Its major part + * is lu_context---a fast memory allocation mechanism that is used to conserve + * precious kernel stack space. Originally lu_env was designed for a server, + * where + * + * - there is a (mostly) fixed number of threads, and + * + * - call chains have no non-lustre portions inserted between lustre code. + * + * On a client both these assumtpion fails, because every user thread can + * potentially execute lustre code as part of a system call, and lustre calls + * into VFS or MM that call back into lustre. + * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environment is amortized by caching no + * longer used environments instead of destroying them; + * + * - there is a notion of "current" environment, attached to the kernel + * data structure representing current thread Top-level lustre code + * allocates an environment and makes it current, then calls into + * non-lustre code, that in turn calls lustre back. Low-level lustre + * code thus called can fetch environment created by the top-level code + * and reuse it, avoiding additional environment allocation. + * Right now, three interfaces can attach the cl_env to running thread: + * - cl_env_get + * - cl_env_implant + * - cl_env_reexit(cl_env_reenter had to be called priorly) + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct cl_env_nest { + int cen_refcheck; + void *cen_cookie; +}; + +struct lu_env *cl_env_peek (int *refcheck); +struct lu_env *cl_env_get (int *refcheck); +struct lu_env *cl_env_alloc (int *refcheck, __u32 tags); +struct lu_env *cl_env_nested_get (struct cl_env_nest *nest); +void cl_env_put (struct lu_env *env, int *refcheck); +void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env); +void *cl_env_reenter (void); +void cl_env_reexit (void *cookie); +void cl_env_implant (struct lu_env *env, int *refcheck); +void cl_env_unplant (struct lu_env *env, int *refcheck); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h new file mode 100644 index 000000000000..e116bb21b529 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/dt_object.h @@ -0,0 +1,1498 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + + +/* + * super-class definitions. + */ +#include + +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_block_shift; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */ + int ddp_mount_type; + unsigned long long ddp_maxbytes; + /* percentage of available space to reserve for grant error margin */ + int ddp_grant_reserved; + /* per-inode space consumption */ + short ddp_inodespace; + /* per-fragment grant overhead to be used by client for grant + * calculation */ + int ddp_grant_frag; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just commit callback + * is needed and per-device callback are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + __u32 dcb_magic; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, struct obd_statfs *osfs); + /** + * Create transaction, described by \a param. + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + /** + * Start transaction, described by \a param. + */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, struct thandle *th); + /** + * Finish previously started transaction. + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct thandle *th); + /** + * Add commit callback to the transaction. + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + /** + * Return fid of root index object. + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f); + /** + * Return device configuration data. + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + /** + * handling device state, mostly for tests + */ + int (*dt_sync)(const struct lu_env *env, struct dt_device *dev); + int (*dt_ro)(const struct lu_env *env, struct dt_device *dev); + /** + * Start a transaction commit asynchronously + * + * \param env environment + * \param dev dt_device to start commit on + * + * \return 0 success, negative value if error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + /** + * Initialize capability context. + */ + int (*dt_init_capa_ctxt)(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. + */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +/** + * Per-dt-object operations. + */ +struct dt_object_operations { + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + /** + * Note: following ->do_{x,}attr_{set,get}() operations are very + * similar to ->moo_{x,}attr_{set,get}() operations in struct + * md_object_operations (see md_object.h). These operations are not in + * lu_object_operations, because ->do_{x,}attr_set() versions take + * transaction handle as an argument (this transaction is started by + * caller). We might factor ->do_{x,}attr_get() into + * lu_object_operations, but that would break existing symmetry. + */ + + /** + * Return standard attributes. + * + * precondition: lu_object_exists(&dt->do_lu); + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, struct lu_attr *attr, + struct lustre_capa *capa); + /** + * Set standard attributes. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle); + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle, + struct lustre_capa *capa); + /** + * Return a value of an extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name, + struct lustre_capa *capa); + /** + * Set value of an extended attribute. + * + * \a fl - flags from enum lu_xattr_flags + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *handle); + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *handle, + struct lustre_capa *capa); + /** + * Delete existing extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle); + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle, + struct lustre_capa *capa); + /** + * Place list of existing extended attributes into \a buf (which has + * length len). + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa); + /** + * Init allocation hint using parent object and child mode. + * (1) The \a parent might be NULL if this is a partial creation for + * remote object. + * (2) The type of child is in \a child_mode. + * (3) The result hint is stored in \a ah; + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t child_mode); + /** + * Create new object on this device. + * + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + int (*do_create)(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + Destroy object on this device + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + int (*do_destroy)(const struct lu_env *env, struct dt_object *dt, + struct thandle *th); + + /** + * Announce that this object is going to be used as an index. This + * operation check that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. + * + * Also probes for features. Operation is successful if all required + * features are supported. + */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + /** + * Add nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + /** + * Del nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + struct obd_capa *(*do_capa_get)(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, + __u64 opc); + int (*do_object_sync)(const struct lu_env *, struct dt_object *); + /** + * Get object info of next level. Currently, only get inode from osd. + * This is only used by quota b=16542 + * precondition: dt_object_exists(dt); + */ + int (*do_data_get)(const struct lu_env *env, struct dt_object *dt, + void **data); + + /** + * Lock object. + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Per-dt-object operations on "file body". + */ +struct dt_body_operations { + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const loff_t size, loff_t pos, + struct thandle *handle); + ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /* + * methods for zero-copy IO + */ + + /* + * precondition: dt_object_exists(dt); + * returns: + * < 0 - error code + * = 0 - illegal + * > 0 - number of local buffers prepared + */ + int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lb, + int rw, struct lustre_capa *capa); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *, + int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *, int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int nr); + int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt, + struct ll_user_fiemap *fm); + /** + * Punch object's content + * precondition: regular object, not index + */ + int (*dbo_declare_punch)(const struct lu_env *, struct dt_object *, + __u64, __u64, struct thandle *th); + int (*dbo_punch)(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. + */ +struct dt_index_operations { + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle); + int (*dio_insert)(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *handle); + int (*dio_delete)(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa); + /** + * Iterator interface + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. + * + * precondition: dt_object_exists(dt); + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr, + struct lustre_capa *capa); + void (*fini)(const struct lu_env *env, + struct dt_it *di); + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + void (*put)(const struct lu_env *env, + struct dt_it *di); + int (*next)(const struct lu_env *env, + struct dt_it *di); + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + int (*load)(const struct lu_env *env, + const struct dt_it *di, __u64 hash); + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, void* key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, +}; + +enum dt_otable_it_flags { + /* Exit when fail. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is up layer component uses the iteration. */ + DOIF_OUTUSED = 0x0004, +}; + +/* otable based iteration needs to use the common DT interation APIs. + * To initialize the iteration, it needs call dio_it::init() firstly. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bits 'attr' for dt_it_ops::init() + * is composed of two parts: + * low 16-bits is for valid bits, high 16-bits is for flags bits. */ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. + */ + struct list_head dd_txn_callbacks; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device * lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +/** + * This is the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. + */ +struct thandle { + /** the dt device on which the transactions are executed */ + struct dt_device *th_dev; + + /** context for this transaction, tag is LCT_TX_HANDLE */ + struct lu_context th_ctx; + + /** additional tags (layers can add in declare) */ + __u32 th_tags; + + /** the last operation result in this transaction. + * this value is used in recovery */ + __s32 th_result; + + /** whether we need sync commit */ + unsigned int th_sync:1; + + /* local transation, no need to inform other layers */ + unsigned int th_local:1; + + /* In DNE, one transaction can be disassemblied into + * updates on several different MDTs, and these updates + * will be attached to th_remote_update_list per target. + * Only single thread will access the list, no need lock + */ + struct list_head th_remote_update_list; + struct update_request *th_current_request; +}; + +/** + * Transaction call-backs. + * + * These are invoked by osd (or underlying transaction engine) when + * transaction changes state. + * + * Call-backs are used by upper layers to modify transaction parameters and to + * perform some actions on for each transaction state transition. Typical + * example is mdt registering call-back to write into last-received file + * before each transaction commit. + */ +struct dt_txn_callback { + int (*dtc_txn_start)(const struct lu_env *env, + struct thandle *txn, void *cookie); + int (*dtc_txn_stop)(const struct lu_env *env, + struct thandle *txn, void *cookie); + void (*dtc_txn_commit)(struct thandle *txn, void *cookie); + void *dtc_cookie; + __u32 dtc_tag; + struct list_head dtc_linkage; +}; + +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb); +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *txn); +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); +void dt_txn_hook_commit(struct thandle *txn); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path. + * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev); +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev); +} + + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, + struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_lock); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int dt_object_sync(const struct lu_env *env, + struct dt_object *o) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, int nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + int size, loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th); + return rc; +} + +static inline int dt_declare_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la, void *arg) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + return dt->do_ops->do_attr_get(env, dt, la, arg); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + return dt->do_ops->do_attr_set(env, dt, la, th, capa); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + return dt->do_ops->do_ref_add(env, dt, th); +} + +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_ref_del(env, dt, th); +} + +static inline struct obd_capa *dt_capa_get(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, __u64 opc) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_capa_get(env, dt, old, opc); +} + +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int rw, + struct lustre_capa *capa) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset, + rnb->len, lnb, rw, capa); +} + +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa); +} + +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct ll_user_fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs); +} + +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + return dev->dd_ops->dt_conf_get(env, dev, param); +} + +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa, + int noquota) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, + capa, noquota); +} + +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + return dt->do_ops->do_xattr_del(env, dt, name, th, capa); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name, struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + return dt->do_ops->do_xattr_get(env, dt, buf, name, capa); +} + +static inline int dt_xattr_list(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + return dt->do_ops->do_xattr_list(env, dt, buf, capa); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + return dt->do_index_ops->dio_delete(env, dt, key, th, capa); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_init_capa_ctxt(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_init_capa_ctxt); + return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, + timeout, alg, keys); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key, + struct lustre_capa *capa) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + loff_t dti_off; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); + +# ifdef LPROCFS +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +# endif /* LPROCFS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h new file mode 100644 index 000000000000..dfdb8aa4e035 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/interval_tree.h @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/interval_tree.h + * + * Author: Huang Wei + * Author: Jay Xiong + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include /* LASSERT. */ + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline void interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + LASSERT(start <= end); + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; +} + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. */ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h new file mode 100644 index 000000000000..227c261b2ae9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/ioctl.h @@ -0,0 +1,106 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _IOWR + +/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h, + * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H. + * + * We can avoid any problems with the kernel header being included again by + * defining _ASM_I386_IOCTL_H here so that a later occurence of + * does not include the kernel's ioctl.h after this one. b=14746 */ +#define _ASM_I386_IOCTL_H +#define _ASM_GENERIC_IOCTL_H + +/* ioctl command encoding: 32 bits total, command in lower 16 bits, + * size of the parameter structure in the lower 14 bits of the + * upper 16 bits. + * Encoding the size of the parameter structure in the ioctl request + * The highest 2 bits are reserved for indicating the ``access mode''. + * NOTE: This limits the max parameter size to 16kB -1 ! + */ + +/* + * The following is for compatibility across the various Linux + * platforms. The i386 ioctl numbering scheme doesn't really enforce + * a type field. De facto, however, the top 8 bits of the lower 16 + * bits are indeed used as a type field, so we might just as well make + * this explicit here. Please be sure to use the decoding macros + * below from now on. + */ +#define _IOC_NRBITS 8 +#define _IOC_TYPEBITS 8 +#define _IOC_SIZEBITS 14 +#define _IOC_DIRBITS 2 + +#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) +#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) +#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) +#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) + +#define _IOC_NRSHIFT 0 +#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) +#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) +#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) + +/* + * Direction bits. + */ +#define _IOC_NONE 0U +#define _IOC_WRITE 1U +#define _IOC_READ 2U + +#define _IOC(dir,type,nr,size) (((dir) << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr) << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT)) + +/* used to create numbers */ +#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) +#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) +#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) +#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) + +/* used to decode ioctl numbers.. */ +#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) +#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) +#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) +#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) + +/* ...and for the drivers/sound files... */ + +#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) +#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) +#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) +#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) +#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) + +#endif /* _IOWR */ diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h new file mode 100644 index 000000000000..d00600ce208f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lclient.h @@ -0,0 +1,441 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Definitions shared between vvp and liblustre, and other clients in the + * future. + * + * Author: Oleg Drokin + * Author: Nikita Danilov + */ + +#ifndef LCLIENT_H +#define LCLIENT_H + +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +/** + * Locking policy for setattr. + */ +enum ccc_setattr_lock_type { + /** Locking is done by server */ + SETATTR_NOLOCK, + /** Extent lock is enqueued */ + SETATTR_EXTENT_LOCK, + /** Existing local extent lock is used */ + SETATTR_MATCH_LOCK +}; + + +/** + * IO state private to vvp or slp layers. + */ +struct ccc_io { + /** super class */ + struct cl_io_slice cui_cl; + struct cl_io_lock_link cui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iovec *cui_iov; + unsigned long cui_nrsegs; + /** + * Total iov count for left IO. + */ + unsigned long cui_tot_nrsegs; + /** + * Old length for iov that was truncated partially. + */ + size_t cui_iov_olen; + /** + * Total size for the left IO. + */ + size_t cui_tot_count; + + union { + struct { + enum ccc_setattr_lock_type cui_local_lock; + } setattr; + } u; + /** + * True iff io is processing glimpse right now. + */ + int cui_glimpse; + /** + * Layout version when this IO is initialized + */ + __u32 cui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *cui_fd; + struct kiocb *cui_iocb; +}; + +/** + * True, if \a io is a normal io, False for other (sendfile, splice*). + * must be impementated in arch specific code. + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io); + +extern struct lu_context_key ccc_key; +extern struct lu_context_key ccc_session_key; + +struct ccc_thread_info { + struct cl_lock_descr cti_descr; + struct cl_io cti_io; + struct cl_attr cti_attr; +}; + +static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env) +{ + struct ccc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &ccc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + memset(attr, 0, sizeof(*attr)); + return attr; +} + +static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &ccc_env_info(env)->cti_io; + memset(io, 0, sizeof(*io)); + return io; +} + +struct ccc_session { + struct ccc_io cs_ios; +}; + +static inline struct ccc_session *ccc_env_session(const struct lu_env *env) +{ + struct ccc_session *ses; + + ses = lu_context_key_get(env->le_ses, &ccc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct ccc_io *ccc_env_io(const struct lu_env *env) +{ + return &ccc_env_session(env)->cs_ios; +} + +/** + * ccc-private object state. + */ +struct ccc_object { + struct cl_object_header cob_header; + struct cl_object cob_cl; + struct inode *cob_inode; + + /** + * A list of dirty pages pending IO in the cache. Used by + * SOM. Protected by ll_inode_info::lli_lock. + * + * \see ccc_page::cpg_pending_linkage + */ + struct list_head cob_pending_list; + + /** + * Access this counter is protected by inode->i_sem. Now that + * the lifetime of transient pages must be covered by inode sem, + * we don't need to hold any lock.. + */ + int cob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t cob_mmap_cnt; + + /** + * various flags + * cob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int cob_discard_page_warned:1; +}; + +/** + * ccc-private page state. + */ +struct ccc_page { + struct cl_page_slice cpg_cl; + int cpg_defer_uptodate; + int cpg_ra_used; + int cpg_write_queued; + /** + * Non-empty iff this page is already counted in + * ccc_object::cob_pending_list. Protected by + * ccc_object::cob_pending_guard. This list is only used as a flag, + * that is, never iterated through, only checked for list_empty(), but + * having a list is useful for debugging. + */ + struct list_head cpg_pending_linkage; + /** VM page */ + struct page *cpg_page; +}; + +static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct ccc_page, cpg_cl); +} + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); + +struct ccc_device { + struct cl_device cdv_cl; + struct super_block *cdv_sb; + struct cl_device *cdv_next; +}; + +struct ccc_lock { + struct cl_lock_slice clk_cl; +}; + +struct ccc_req { + struct cl_req_slice crq_cl; +}; + +void *ccc_key_init (const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_key_fini (const struct lu_context *ctx, + struct lu_context_key *key, void *data); +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + +int ccc_device_init (const struct lu_env *env, + struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *ccc_device_fini (const struct lu_env *env, + struct lu_device *d); +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops); +struct lu_device *ccc_device_free (const struct lu_env *env, + struct lu_device *d); +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops); + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +void ccc_umount(const struct lu_env *env, struct cl_device *dev); +int ccc_global_init(struct lu_device_type *device_type); +void ccc_global_fini(struct lu_device_type *device_type); +int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob, + const struct cl_object_conf *conf); +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void ccc_object_free(const struct lu_env *env, struct lu_object *obj); +int ccc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io, + const struct cl_lock_operations *lkops); +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice); +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice); +void ccc_transient_page_verify(const struct cl_page *page); +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice); +void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice); +int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); +int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state); + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end); +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end); +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios); +void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios, + size_t nob); +void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio, + struct cl_io *io); +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed); +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *oa, obd_valid flags); + +struct lu_device *ccc2lu_dev (struct ccc_device *vdv); +struct lu_object *ccc2lu (struct ccc_object *vob); +struct ccc_device *lu2ccc_dev (const struct lu_device *d); +struct ccc_device *cl2ccc_dev (const struct cl_device *d); +struct ccc_object *lu2ccc (const struct lu_object *obj); +struct ccc_object *cl2ccc (const struct cl_object *obj); +struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice); +struct ccc_io *cl2ccc_io (const struct lu_env *env, + const struct cl_io_slice *slice); +struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice); +struct page *cl2vm_page (const struct cl_page_slice *slice); +struct inode *ccc_object_inode(const struct cl_object *obj); +struct ccc_object *cl_inode2ccc (struct inode *inode); + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa); + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); +int ccc_object_invariant(const struct cl_object *obj); +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); +int cl_local_size(struct inode *inode); + +__u16 ll_dirent_type_get(struct lu_dirent *ent); +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +__u32 cl_fid_build_gen(const struct lu_fid *fid); + +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) + +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp); +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + +struct ccc_grouplock { + struct lu_env *cg_env; + struct cl_io *cg_io; + struct cl_lock *cg_lock; + unsigned long cg_gid; +}; + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg); +void cl_put_grouplock(struct ccc_grouplock *cg); + +/** + * New interfaces to get and put lov_stripe_md from lov layer. This violates + * layering because lov_stripe_md is supposed to be a private data in lov. + * + * NB: If you find you have to use these interfaces for your new code, please + * think about it again. These interfaces may be removed in the future for + * better layering. */ +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj); +void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm); +int lov_read_and_clear_async_rc(struct cl_object *clob); + +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode); +void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm); + +/** + * Data structure managing a client's cached pages. A count of + * "unstable" pages is maintained, and an LRU of clean pages is + * maintained. "unstable" pages are pages pinned by the ptlrpc + * layer for recovery purposes. + */ +struct cl_client_cache { + atomic_t ccc_users; /* # of users (OSCs) of this data */ + struct list_head ccc_lru; /* LRU list of cached clean pages */ + spinlock_t ccc_lru_lock; /* lock for list */ + atomic_t ccc_lru_left; /* # of LRU entries available */ + unsigned long ccc_lru_max; /* Max # of LRU entries possible */ + unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */ + atomic_t ccc_unstable_nr; /* # of unstable pages pinned */ + wait_queue_head_t ccc_unstable_waitq; /* Signaled on BRW commit */ +}; + +#endif /*LCLIENT_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h new file mode 100644 index 000000000000..586692272d78 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lprocfs_status.h + * + * Top level header file for LProc SNMP + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LINUX_LPROCFS_SNMP_H +#define _LINUX_LPROCFS_SNMP_H + +#ifndef _LPROCFS_SNMP_H +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include +#include +#include +#include + + +#endif /* LPROCFS_SNMP_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h new file mode 100644 index 000000000000..ff4fc4ff2894 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h @@ -0,0 +1,66 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_acl.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_LINUX_ACL_H +#define _LUSTRE_LINUX_ACL_H + +#ifndef _LUSTRE_ACL_H +#error Shoud not include direectly. use #include instead +#endif + +# include +# include +# ifdef CONFIG_FS_POSIX_ACL +# include +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +# endif /* CONFIG_FS_POSIX_ACL */ +# include +# include /* XATTR_{REPLACE,CREATE} */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE +# define LUSTRE_POSIX_ACL_MAX_SIZE 0 +#endif + +#endif /* _LUSTRE_LINUX_ACL_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h new file mode 100644 index 000000000000..d1783a33d8ca --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_common.h @@ -0,0 +1,22 @@ +#ifndef LUSTRE_COMMON_H +#define LUSTRE_COMMON_H + +#include + +static inline int cfs_cleanup_group_info(void) +{ + struct group_info *ginfo; + + ginfo = groups_alloc(0); + if (!ginfo) + return -ENOMEM; + + set_current_groups(ginfo); + put_group_info(ginfo); + + return 0; +} + +#define ll_inode_blksize(a) (1<<(a)->i_blkbits) + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h new file mode 100644 index 000000000000..dff04688945b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -0,0 +1,349 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_COMPAT25_H +#define _LINUX_COMPAT25_H + +#include +#include +#include + +#include + +# define LOCK_FS_STRUCT(fs) spin_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) spin_unlock(&(fs)->lock) + +static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct path path; + struct path old_pwd; + + path.mnt = mnt; + path.dentry = dentry; + LOCK_FS_STRUCT(fs); + old_pwd = fs->pwd; + path_get(&path); + fs->pwd = path; + UNLOCK_FS_STRUCT(fs); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + + +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + + +#define LTIME_S(time) (time.tv_sec) + +#define ll_permission(inode,mask,nd) inode_permission(inode,mask) + +# define ll_generic_permission(inode, mask, flags, check_acl) \ + generic_permission(inode, mask) + +#define ll_blkdev_put(a, b) blkdev_put(a, b) + +#define ll_dentry_open(a,b,c) dentry_open(a,b,c) + +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path) + + +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes, eof); + +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count) +/* inode_dio_done(i) use as-is for read unlock */ + +#define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock) +#define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock) + +static inline +int ll_unregister_blkdev(unsigned int dev, const char *name) +{ + unregister_blkdev(dev, name); + return 0; +} + +#define ll_invalidate_bdev(a,b) invalidate_bdev((a)) + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + + + +/* add a lustre compatible layer for crypto API */ +#include +#define ll_crypto_hash crypto_hash +#define ll_crypto_cipher crypto_blkcipher +#define ll_crypto_alloc_hash(name, type, mask) crypto_alloc_hash(name, type, mask) +#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen) +#define ll_crypto_hash_init(desc) crypto_hash_init(desc) +#define ll_crypto_hash_update(desc, sl, bytes) crypto_hash_update(desc, sl, bytes) +#define ll_crypto_hash_final(desc, out) crypto_hash_final(desc, out) +#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey(tfm, key, keylen) +#define ll_crypto_blkcipher_set_iv(tfm, src, len) \ + crypto_blkcipher_set_iv(tfm, src, len) +#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \ + crypto_blkcipher_get_iv(tfm, dst, len) +#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \ + crypto_blkcipher_encrypt(desc, dst, src, bytes) +#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \ + crypto_blkcipher_decrypt(desc, dst, src, bytes) +#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \ + crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) +#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \ + crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) + +static inline +struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name, + u32 type, u32 mask) +{ + struct ll_crypto_cipher *rtn = crypto_alloc_blkcipher(name, type, mask); + + return (rtn == NULL ? ERR_PTR(-ENOMEM) : rtn); +} + +static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm, + u8 *key, unsigned int *keylen, + struct scatterlist *sg, + unsigned int size, u8 *result) +{ + struct hash_desc desc; + int rv; + desc.tfm = tfm; + desc.flags = 0; + rv = crypto_hash_setkey(desc.tfm, key, *keylen); + if (rv) { + CERROR("failed to hash setkey: %d\n", rv); + return rv; + } + return crypto_hash_digest(&desc, sg, size, result); +} +static inline +unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize; +} +static inline +unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize; +} + +#define ll_crypto_hash_blocksize(tfm) crypto_hash_blocksize(tfm) +#define ll_crypto_hash_digestsize(tfm) crypto_hash_digestsize(tfm) +#define ll_crypto_blkcipher_ivsize(tfm) crypto_blkcipher_ivsize(tfm) +#define ll_crypto_blkcipher_blocksize(tfm) crypto_blkcipher_blocksize(tfm) +#define ll_crypto_free_hash(tfm) crypto_free_hash(tfm) +#define ll_crypto_free_blkcipher(tfm) crypto_free_blkcipher(tfm) + +#define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry) +#define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode) +#define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new) +#define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry) +#define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev) +#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) +#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \ + vfs_rename(old,old_dir,new,new_dir) + +#ifdef for_each_possible_cpu +#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) +#elif defined(for_each_cpu) +#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu) +#endif + +#define cfs_bio_io_error(a,b) bio_io_error((a)) +#define cfs_bio_endio(a,b,c) bio_endio((a),(c)) + +#define cfs_fs_pwd(fs) ((fs)->pwd.dentry) +#define cfs_fs_mnt(fs) ((fs)->pwd.mnt) +#define cfs_path_put(nd) path_put(&(nd)->path) + + +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 +#endif + + + +static inline int +ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount) +{ + int rc; + + if (sb->s_qcop->quota_on) { + struct path path; + + rc = kern_path(name, LOOKUP_FOLLOW, &path); + if (!rc) + return rc; + rc = sb->s_qcop->quota_on(sb, off, ver + , &path + ); + path_put(&path); + return rc; + } + else + return -ENOSYS; +} + +static inline int ll_quota_off(struct super_block *sb, int off, int remount) +{ + if (sb->s_qcop->quota_off) { + return sb->s_qcop->quota_off(sb, off + ); + } + else + return -ENOSYS; +} + + +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) + + + + + +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) + +#define ll_kmap_atomic(a, b) kmap_atomic(a) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a) + + +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, alias) + + +#define bio_hw_segments(q, bio) 0 + + +#define ll_pagevec_init(pv, cold) do {} while (0) +#define ll_pagevec_add(pv, pg) (0) +#define ll_pagevec_lru_add_file(pv) do {} while (0) + + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + + + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +# define ll_mrf_ret void +# define LL_MRF_RETURN(rc) + +#include + +# define ll_umode_t umode_t + +#include + +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) + +#endif /* _COMPAT25_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h new file mode 100644 index 000000000000..11deac7248ae --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_DEBUG_H +#define _LINUX_LUSTRE_DEBUG_H + +#ifndef _LUSTRE_DEBUG_H +#error Do not #include this file directly. #include instead +#endif + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \ + CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\ + fmt, page, page->mapping, page->index, (long)page->flags, \ + page_count(page), page_private(page), ## arg) + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h new file mode 100644 index 000000000000..207df03f6149 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_DLM_H__ +#define _LINUX_LUSTRE_DLM_H__ + +#ifndef _LUSTRE_DLM_H__ +#error Do not #include this file directly. #include instead +#endif + +# include +# include +# include + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h new file mode 100644 index 000000000000..6c7260957383 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h @@ -0,0 +1,181 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_fsfilt.h + * + * Filesystem interface helper. + */ + +#ifndef _LINUX_LUSTRE_FSFILT_H +#define _LINUX_LUSTRE_FSFILT_H + +#ifndef _LUSTRE_FSFILT_H +#error Do not #include this file directly. #include instead +#endif + + +#include +#include + +typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd, + void *data, int error); + +struct fsfilt_operations { + struct list_head fs_list; + module_t *fs_owner; + char *fs_type; + char *(* fs_getlabel)(struct super_block *sb); + void *(* fs_start)(struct inode *inode, int op, void *desc_private, + int logs); + int (* fs_commit)(struct inode *inode, void *handle,int force_sync); + int (* fs_map_inode_pages)(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int create, struct mutex *sem); + int (* fs_write_record)(struct file *, void *, int size, loff_t *, + int force_sync); + int (* fs_read_record)(struct file *, void *, int size, loff_t *); + int (* fs_setup)(struct super_block *sb); +}; + +extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops); +extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops); +extern struct fsfilt_operations *fsfilt_get_ops(const char *type); +extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); + +static inline char *fsfilt_get_label(struct obd_device *obd, + struct super_block *sb) +{ + if (obd->obd_fsops->fs_getlabel == NULL) + return NULL; + if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0') + return NULL; + + return obd->obd_fsops->fs_getlabel(sb); +} + +#define FSFILT_OP_UNLINK 1 +#define FSFILT_OP_CANCEL_UNLINK 10 + +#define __fsfilt_check_slow(obd, start, msg) \ +do { \ + if (cfs_time_before(jiffies, start + 15 * HZ)) \ + break; \ + else if (cfs_time_before(jiffies, start + 30 * HZ)) \ + CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \ + msg, (jiffies-start) / HZ); \ + else if (cfs_time_before(jiffies, start + DISK_TIMEOUT * HZ)) \ + CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ + else \ + CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ +} while (0) + +#define fsfilt_check_slow(obd, start, msg) \ +do { \ + __fsfilt_check_slow(obd, start, msg); \ + start = jiffies; \ +} while (0) + +static inline void *fsfilt_start_log(struct obd_device *obd, + struct inode *inode, int op, + struct obd_trans_info *oti, int logs) +{ + unsigned long now = jiffies; + void *parent_handle = oti ? oti->oti_handle : NULL; + void *handle; + + handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs); + CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle); + + if (oti != NULL) { + if (parent_handle == NULL) { + oti->oti_handle = handle; + } else if (handle != parent_handle) { + CERROR("mismatch: parent %p, handle %p, oti %p\n", + parent_handle, handle, oti); + LBUG(); + } + } + fsfilt_check_slow(obd, now, "journal start"); + return handle; +} + +static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, + void *handle, int force_sync) +{ + unsigned long now = jiffies; + int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); + CDEBUG(D_INFO, "committing handle %p\n", handle); + + fsfilt_check_slow(obd, now, "journal start"); + + return rc; +} + +static inline int fsfilt_map_inode_pages(struct obd_device *obd, + struct inode *inode, + struct page **page, int pages, + unsigned long *blocks, + int create, struct mutex *mutex) +{ + return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks, + create, mutex); +} + +static inline int fsfilt_read_record(struct obd_device *obd, struct file *file, + void *buf, loff_t size, loff_t *offs) +{ + return obd->obd_fsops->fs_read_record(file, buf, size, offs); +} + +static inline int fsfilt_write_record(struct obd_device *obd, struct file *file, + void *buf, loff_t size, loff_t *offs, + int force_sync) +{ + return obd->obd_fsops->fs_write_record(file, buf, size,offs,force_sync); +} + +static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs) +{ + if (obd->obd_fsops->fs_setup) + return obd->obd_fsops->fs_setup(fs); + return 0; +} + + + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h new file mode 100644 index 000000000000..ecf184051252 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_LUSTRE_HANDLES_H_ +#define __LINUX_LUSTRE_HANDLES_H_ + +#ifndef __LUSTRE_HANDLES_H_ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include +#include +#include + +#include /* for rcu_head{} */ +typedef struct rcu_head cfs_rcu_head_t; + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h new file mode 100644 index 000000000000..b10ddfa7df29 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + void *it_data; + unsigned int it_lock_set:1; +}; + +struct lookup_intent { + int it_op; + int it_flags; + int it_create_mode; + union { + struct lustre_intent_data lustre; + } d; +}; + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h new file mode 100644 index 000000000000..b2f755acadf6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LINUX_LUSTRE_LIB_H +#define _LINUX_LUSTRE_LIB_H + +#ifndef _LUSTRE_LIB_H +#error Do not #include this file directly. #include instead +#endif + +# include +# include +# include +# include +# include +# include + +#ifndef LP_POISON +#if BITS_PER_LONG > 32 +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) +#else +# define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a) +#endif +#endif + +/* This macro is only for compatibility reasons with older Linux Lustre user + * tools. New ioctls should NOT use this macro as the ioctl "size". Instead + * the ioctl should get a "size" argument which is the actual data type used + * by the ioctl, to ensure the ioctl interface is versioned correctly. */ +#define OBD_IOC_DATA_TYPE long + +#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ + sigmask(SIGTERM) | sigmask(SIGQUIT) | \ + sigmask(SIGALRM)) + +/* initialize ost_lvb according to inode */ +static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb) +{ + lvb->lvb_size = i_size_read(inode); + lvb->lvb_blocks = inode->i_blocks; + lvb->lvb_mtime = LTIME_S(inode->i_mtime); + lvb->lvb_atime = LTIME_S(inode->i_atime); + lvb->lvb_ctime = LTIME_S(inode->i_ctime); +} + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h new file mode 100644 index 000000000000..c95dff900b58 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h @@ -0,0 +1,100 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LL_H +#define _LINUX_LL_H + +#ifndef _LL_H +#error Do not #include this file directly. #include instead +#endif + + +#include + +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* lprocfs.c */ +enum { + LPROC_LL_DIRTY_HITS = 0, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_OSC_READ, + LPROC_LL_OSC_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h new file mode 100644 index 000000000000..e9c8e56737d2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_log.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LINUX_LUSTRE_LOG_H +#define _LINUX_LUSTRE_LOG_H + +#ifndef _LUSTRE_LOG_H +#error Do not #include this file directly. #include instead +#endif + +#define LUSTRE_LOG_SERVER + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h new file mode 100644 index 000000000000..2d7c425d7012 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_net.h @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_NET_H +#define _LINUX_LUSTRE_NET_H + +#ifndef _LUSTRE_NET_H +#error Do not #include this file directly. #include instead +#endif + +#include +#include + +/* XXX Liang: should be moved to other header instead of here */ +#ifndef WITH_GROUP_INFO +#define WITH_GROUP_INFO +#endif + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h new file mode 100644 index 000000000000..f0508084e8c5 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include + +#include +#include +#include + + +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + page->mapping->a_ops->invalidatepage(page, 0); + + cancel_dirty_page(page, PAGE_SIZE); + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} + +# define d_refcount(d) ((d)->d_count) + +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h new file mode 100644 index 000000000000..421866b004cf --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_QUOTA_H +#define _LINUX_LUSTRE_QUOTA_H + +#ifndef _LUSTRE_QUOTA_H +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include + +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h new file mode 100644 index 000000000000..ebaf92977f7f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LINUX_LUSTRE_USER_H +#define _LINUX_LUSTRE_USER_H + +# include +# include + +/* + * asm-x86_64/processor.h on some SLES 9 distros seems to use + * kernel-only typedefs. fortunately skipping it altogether is ok + * (for now). + */ +#define __ASM_X86_64_PROCESSOR_H + +#include + +#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ + defined(__craynv) || defined (__mips64__) || defined(__powerpc64__) +typedef struct stat lstat_t; +#define lstat_f lstat +#define HAVE_LOV_USER_MDS_DATA +#else +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define HAVE_LOV_USER_MDS_DATA +#endif + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h new file mode 100644 index 000000000000..b4db6cb581bd --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h @@ -0,0 +1,134 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LINUX_LVFS_H__ +#define __LINUX_LVFS_H__ + +#ifndef __LVFS_H__ +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include + +#define LLOG_LVFS + +/* simple.c */ + +struct lvfs_ucred { + __u32 luc_uid; + __u32 luc_gid; + __u32 luc_fsuid; + __u32 luc_fsgid; + kernel_cap_t luc_cap; + __u32 luc_umask; + struct group_info *luc_ginfo; + struct md_identity *luc_identity; +}; + +struct lvfs_callback_ops { + struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr, void *data); +}; + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + mm_segment_t fs; + struct lvfs_ucred luc; + int ngroups; + struct lvfs_callback_ops cb_ops; + struct group_info *group_info; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + __u32 magic; +#endif +}; + +#ifdef OBD_CTXT_DEBUG +#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC +#else +#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0) +#endif + + +int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname, + char *newname); + +static inline void l_dput(struct dentry *de) +{ + if (!de || IS_ERR(de)) + return; + //shrink_dcache_parent(de); + LASSERT(d_refcount(de) > 0); + dput(de); +} + +/* We need to hold the inode semaphore over the dcache lookup itself, or we + * run the risk of entering the filesystem lookup path concurrently on SMP + * systems, and instantiating two inodes for the same entry. We still + * protect against concurrent addition/removal races with the DLM locking. + */ +static inline struct dentry *ll_lookup_one_len(const char *fid_name, + struct dentry *dparent, + int fid_namelen) +{ + struct dentry *dchild; + + mutex_lock(&dparent->d_inode->i_mutex); + dchild = lookup_one_len(fid_name, dparent, fid_namelen); + mutex_unlock(&dparent->d_inode->i_mutex); + + if (IS_ERR(dchild) || dchild->d_inode == NULL) + return dchild; + + if (is_bad_inode(dchild->d_inode)) { + CERROR("bad inode returned %lu/%u\n", + dchild->d_inode->i_ino, dchild->d_inode->i_generation); + dput(dchild); + dchild = ERR_PTR(-ENOENT); + } + return dchild; +} + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h new file mode 100644 index 000000000000..140a60f1f0c9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h @@ -0,0 +1,66 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LVFS_LINUX_H__ +#define __LVFS_LINUX_H__ + +#include +#include +#include +#include +#include + +#include + +#define l_file file +#define l_dentry dentry + +#define l_filp_open filp_open + +struct lvfs_run_ctxt; +struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *, + int flags); + +struct l_linux_dirent { + struct list_head lld_list; + ino_t lld_ino; + unsigned long lld_off; + char lld_name[LL_FID_NAMELEN]; +}; +struct l_readdir_callback { + struct l_linux_dirent *lrc_dirent; + struct list_head *lrc_list; +}; + +#endif /* __LVFS_LINUX_H__ */ diff --git a/drivers/staging/lustre/lustre/include/linux/obd.h b/drivers/staging/lustre/lustre/include/linux/obd.h new file mode 100644 index 000000000000..2c36c0d19d06 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/obd.h @@ -0,0 +1,128 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_OBD_H +#define __LINUX_OBD_H + +#ifndef __OBD_H +#error Do not #include this file directly. #include instead +#endif + +#include + +# include +# include +# include /* for struct task_struct, for current.h */ +# include +# include +# include + +struct ll_iattr { + struct iattr iattr; + unsigned int ia_attr_flags; +}; + +#define CLIENT_OBD_LIST_LOCK_DEBUG 1 + +typedef struct { + spinlock_t lock; + + unsigned long time; + struct task_struct *task; + const char *func; + int line; +} client_obd_lock_t; + +static inline void __client_obd_list_lock(client_obd_lock_t *lock, + const char *func, int line) +{ + unsigned long cur = jiffies; + while (1) { + if (spin_trylock(&lock->lock)) { + LASSERT(lock->task == NULL); + lock->task = current; + lock->func = func; + lock->line = line; + lock->time = jiffies; + break; + } + + if ((jiffies - cur > 5 * HZ) && + (jiffies - lock->time > 5 * HZ)) { + struct task_struct *task = lock->task; + + if (task == NULL) + continue; + + LCONSOLE_WARN("%s:%d: lock %p was acquired" + " by <%s:%d:%s:%d> for %lu seconds.\n", + current->comm, current->pid, + lock, task->comm, task->pid, + lock->func, lock->line, + (jiffies - lock->time) / HZ); + LCONSOLE_WARN("====== for process holding the " + "lock =====\n"); + libcfs_debug_dumpstack(task); + LCONSOLE_WARN("====== for current process =====\n"); + libcfs_debug_dumpstack(NULL); + LCONSOLE_WARN("====== end =======\n"); + cfs_pause(1000 * HZ); + } + cpu_relax(); + } +} + +#define client_obd_list_lock(lock) \ + __client_obd_list_lock(lock, __FUNCTION__, __LINE__) + +static inline void client_obd_list_unlock(client_obd_lock_t *lock) +{ + LASSERT(lock->task != NULL); + lock->task = NULL; + lock->time = jiffies; + spin_unlock(&lock->lock); +} + + +static inline void client_obd_list_lock_init(client_obd_lock_t *lock) +{ + spin_lock_init(&lock->lock); +} + +static inline void client_obd_list_lock_done(client_obd_lock_t *lock) +{} + +#endif /* __LINUX_OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h new file mode 100644 index 000000000000..021ead6639fc --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/obd_class.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_CLASS_OBD_H +#define __LINUX_CLASS_OBD_H + +#ifndef __CLASS_OBD_H +#error Do not #include this file directly. #include instead +#endif + +#include +#include +#include +#include +#include + +/* obdo.c */ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid); +void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid); +void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid); +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid); +#define ll_inode_flags(inode) (inode->i_flags) + + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h new file mode 100644 index 000000000000..9166503408aa --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/obd_support.h @@ -0,0 +1,63 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_OBD_SUPPORT +#define _LINUX_OBD_SUPPORT + +#ifndef _OBD_SUPPORT +#error Do not #include this file directly. #include instead +#endif + +#ifdef CONFIG_X86 +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +# include +# include +# include + +#endif diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h new file mode 100644 index 000000000000..e4e8f72b92b1 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h @@ -0,0 +1,1100 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lprocfs_status.h + * + * Top level header file for LProc SNMP + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LPROCFS_SNMP_H +#define _LPROCFS_SNMP_H + +#include +#include +#include + +struct lprocfs_vars { + const char *name; + read_proc_t *read_fptr; + write_proc_t *write_fptr; + void *data; + struct file_operations *fops; + /** + * /proc file mode. + */ + mode_t proc_mode; +}; + +struct lprocfs_static_vars { + struct lprocfs_vars *module_vars; + struct lprocfs_vars *obd_vars; +}; + +/* if we find more consumers this could be generalized */ +#define OBD_HIST_MAX 32 +struct obd_histogram { + spinlock_t oh_lock; + unsigned long oh_buckets[OBD_HIST_MAX]; +}; + +enum { + BRW_R_PAGES = 0, + BRW_W_PAGES, + BRW_R_RPC_HIST, + BRW_W_RPC_HIST, + BRW_R_IO_TIME, + BRW_W_IO_TIME, + BRW_R_DISCONT_PAGES, + BRW_W_DISCONT_PAGES, + BRW_R_DISCONT_BLOCKS, + BRW_W_DISCONT_BLOCKS, + BRW_R_DISK_IOSIZE, + BRW_W_DISK_IOSIZE, + BRW_R_DIO_FRAGS, + BRW_W_DIO_FRAGS, + BRW_LAST, +}; + +struct brw_stats { + struct obd_histogram hist[BRW_LAST]; +}; + +enum { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST, +}; + +struct rename_stats { + struct obd_histogram hist[RENAME_LAST]; +}; + +/* An lprocfs counter can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 0x0001, + LPROCFS_CNTR_AVGMINMAX = 0x0002, + LPROCFS_CNTR_STDDEV = 0x0004, + + /* counter data type */ + LPROCFS_TYPE_REGS = 0x0100, + LPROCFS_TYPE_BYTES = 0x0200, + LPROCFS_TYPE_PAGES = 0x0400, + LPROCFS_TYPE_CYCLE = 0x0800, +}; + +#define LC_MIN_INIT ((~(__u64)0) >> 1) + +struct lprocfs_counter_header { + unsigned int lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ +}; + +struct lprocfs_counter { + __s64 lc_count; + __s64 lc_min; + __s64 lc_max; + __s64 lc_sumsquare; + /* + * Every counter has lc_array_sum[0], while lc_array_sum[1] is only + * for irq context counter, i.e. stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { +#ifndef __GNUC__ + __s64 pad; +#endif + struct lprocfs_counter lp_cntr[0]; +}; + +#define LPROCFS_GET_NUM_CPU 0x0001 +#define LPROCFS_GET_SMP_ID 0x0002 + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(UPDATE)) + +#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ + OPC_RANGE(EXTRA)) + +enum { + PTLRPC_REQWAIT_CNTR = 0, + PTLRPC_REQQDEPTH_CNTR, + PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, + PTLRPC_REQBUF_AVAIL_CNTR, + PTLRPC_LAST_CNTR +}; + +#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR + +enum { + LDLM_GLIMPSE_ENQUEUE = 0, + LDLM_PLAIN_ENQUEUE, + LDLM_EXTENT_ENQUEUE, + LDLM_FLOCK_ENQUEUE, + LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, + MDS_REINT_CREATE, + MDS_REINT_LINK, + MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, + BRW_READ_BYTES, + BRW_WRITE_BYTES, + EXTRA_LAST_OPC +}; + +#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE +/* class_obd.c */ +extern proc_dir_entry_t *proc_lustre_root; + +struct obd_device; +struct obd_histogram; + +/* Days / hours / mins / seconds format */ +struct dhms { + int d,h,m,s; +}; +static inline void s2dhms(struct dhms *ts, time_t secs) +{ + ts->d = secs / 86400; + secs = secs % 86400; + ts->h = secs / 3600; + secs = secs % 3600; + ts->m = secs / 60; + ts->s = secs % 60; +} +#define DHMS_FMT "%dd%dh%02dm%02ds" +#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s + +#define JOBSTATS_JOBID_VAR_MAX_LEN 20 +#define JOBSTATS_DISABLE "disable" +#define JOBSTATS_PROCNAME_UID "procname_uid" + +typedef void (*cntr_init_callback)(struct lprocfs_stats *stats); + +struct obd_job_stats { + cfs_hash_t *ojs_hash; + struct list_head ojs_list; + rwlock_t ojs_lock; /* protect the obj_list */ + cntr_init_callback ojs_cntr_init_fn; + int ojs_cntr_num; + int ojs_cleanup_interval; + time_t ojs_last_cleanup; +}; + +#ifdef LPROCFS + +extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, + unsigned int cpuid); +/* + * \return value + * < 0 : on error (only possible for opc as LPROCFS_GET_SMP_ID) + */ +static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + int rc = 0; + + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 0; + } else { + unsigned int cpuid = get_cpu(); + + if (unlikely(stats->ls_percpu[cpuid] == NULL)) { + rc = lprocfs_stats_alloc_one(stats, cpuid); + if (rc < 0) { + put_cpu(); + return rc; + } + } + return cpuid; + } + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, *flags); + else + spin_lock(&stats->ls_lock); + return 1; + } else { + return stats->ls_biggest_alloc_num; + } + } +} + +static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc, + unsigned long *flags) +{ + switch (opc) { + default: + LBUG(); + + case LPROCFS_GET_SMP_ID: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } else { + put_cpu(); + } + return; + + case LPROCFS_GET_NUM_CPU: + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, + *flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + return; + } +} + +static inline unsigned int +lprocfs_stats_counter_size(struct lprocfs_stats *stats) +{ + unsigned int percpusize; + + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + + /* irq safe stats need lc_array_sum[1] */ + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpusize += stats->ls_num * sizeof(__s64); + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0) + percpusize = L1_CACHE_ALIGN(percpusize); + + return percpusize; +} + +static inline struct lprocfs_counter * +lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid, + int index) +{ + struct lprocfs_counter *cntr; + + cntr = &stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr + index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr, value) - optimized for by-one counters + * lprocfs_counter_add(cntr) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. + */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, + int idx, + enum lprocfs_fields_flags field) +{ + int i; + unsigned int num_cpu; + unsigned long flags = 0; + __u64 ret = 0; + + LASSERT(stats != NULL); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + ret += lprocfs_read_helper( + lprocfs_stats_counter_get(stats, i, idx), + &stats->ls_cnt_header[idx], stats->ls_flags, + field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats); +extern int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats); +extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, + const char *units); +extern void lprocfs_free_obd_stats(struct obd_device *obddev); +extern void lprocfs_free_md_stats(struct obd_device *obddev); +struct obd_export; +struct nid_stat; +extern int lprocfs_add_clear_entry(struct obd_device * obd, + proc_dir_entry_t *entry); +extern int lprocfs_exp_setup(struct obd_export *exp, + lnet_nid_t *peer_nid, int *newnid); +extern int lprocfs_exp_cleanup(struct obd_export *exp); +extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + read_proc_t *read_proc, + write_proc_t *write_proc, + void *data, + struct file_operations *fops); +extern struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...); +extern void lprocfs_free_per_client_stats(struct obd_device *obd); +extern int +lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off, + int count, int *eof, void *data); + +extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name, + struct lprocfs_stats *stats); + +/* lprocfs_status.c */ +extern int lprocfs_add_vars(proc_dir_entry_t *root, + struct lprocfs_vars *var, + void *data); + +extern proc_dir_entry_t *lprocfs_register(const char *name, + proc_dir_entry_t *parent, + struct lprocfs_vars *list, + void *data); + +extern void lprocfs_remove(proc_dir_entry_t **root); +extern void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); +extern void lprocfs_try_remove_proc_entry(const char *name, + struct proc_dir_entry *parent); + +extern proc_dir_entry_t *lprocfs_srch(proc_dir_entry_t *root, + const char *name); + +extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list); +extern int lprocfs_obd_cleanup(struct obd_device *obd); +extern struct file_operations lprocfs_evict_client_fops; + +extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data); +extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data); + +/* Generic callbacks */ + +extern int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_atomic(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_atomic(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uint(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_uint(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_name(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_import(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_rd_state(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_num_exports(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_numrefs(char *page, char **start, off_t off, + int count, int *eof, void *data); +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at); +extern int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_wr_evict_client(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_wr_ping(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_wr_import(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_rd_pinger_recov(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer, + unsigned long count, void *data); + +/* Statfs helpers */ +extern int lprocfs_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_filegroups(char *page, char **start, off_t off, + int count, int *eof, void *data); + +extern int lprocfs_write_helper(const char *buffer, unsigned long count, + int *val); +extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count, + __u64 *val); +extern int lprocfs_write_frac_u64_helper(const char *buffer, + unsigned long count, + __u64 *val, int mult); +char *lprocfs_find_named_value(const char *buffer, const char *name, + unsigned long *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +/* lprocfs_status.c: recovery status */ +int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off, + int count, int *eof, void *data); + +/* lprocfs_statuc.c: hash statistics */ +int lprocfs_obd_rd_hash(char *page, char **start, off_t off, + int count, int *eof, void *data); + +/* lprocfs_status.c: IR factor */ +int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer, + unsigned long count, void *data); + +extern int lprocfs_single_release(cfs_inode_t *, struct file *); +extern int lprocfs_seq_release(cfs_inode_t *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; \ + } \ +} while(0) +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem); + + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_SEQ instead. Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(cfs_inode_t *inode, struct file *file) { \ + struct proc_dir_entry *dp = PDE(inode); \ + int rc; \ + LPROCFS_ENTRY_AND_CHECK(dp); \ + rc = single_open(file, name##_seq_show, dp->data); \ + if (rc) { \ + LPROCFS_EXIT(); \ + return rc; \ + } \ + return 0; \ +} \ +struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +int lprocfs_rd_job_interval(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_wr_job_interval(struct file *file, const char *buffer, + unsigned long count, void *data); + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +/* lproc_status.c */ +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_soft(struct file *file, + const char *buffer, + unsigned long count, void *data); +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_recovery_time_hard(struct file *file, + const char *buffer, + unsigned long count, void *data); +int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer, + unsigned long count, void *data); +int lprocfs_target_rd_instance(char *page, char **start, off_t off, + int count, int *eof, void *data); + +/* all quota proc functions */ +extern int lprocfs_quota_rd_bunit(char *page, char **start, + off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_btune(char *page, char **start, + off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_iunit(char *page, char **start, + off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_itune(char *page, char **start, + off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_seconds(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_qs(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_boundary_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_bunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_iunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_qs_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); + + + +#else +/* LPROCFS is not defined */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(proc_dir_entry_t *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid, + int *newnid) +{ return 0; } +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline proc_dir_entry_t * +lprocfs_add_simple(struct proc_dir_entry *root, char *name, + read_proc_t *read_proc, write_proc_t *write_proc, + void *data, struct file_operations *fops) +{return 0; } +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) +{return NULL; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +static inline +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{return count;} +static inline +int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{return count;} + +static inline proc_dir_entry_t * +lprocfs_register(const char *name, proc_dir_entry_t *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline int lprocfs_add_vars(proc_dir_entry_t *root, + struct lprocfs_vars *var, + void *data) +{ return 0; } +static inline void lprocfs_remove(proc_dir_entry_t **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline void lprocfs_try_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline proc_dir_entry_t *lprocfs_srch(proc_dir_entry_t *head, + const char *name) +{ return 0; } +static inline int lprocfs_obd_setup(struct obd_device *dev, + struct lprocfs_vars *list) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_name(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_import(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_pinger_recov(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_state(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, + const char *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_evict_client(struct file *file, + const char *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_ping(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_import(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +int lprocfs_rd_filegroups(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ return; } +static inline +__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) + +/* lprocfs_jobstats.c */ +static inline +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event, + long amount) +{ return 0; } +static inline +void lprocfs_job_stats_fini(struct obd_device *obd) +{ return; } +static inline +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn) +{ return 0; } + + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* LPROCFS */ + +#endif /* LPROCFS_SNMP_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h new file mode 100644 index 000000000000..4bd11bbe2783 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -0,0 +1,1346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#include +#include +#include +#include + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). + * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. this method called after layer has + * been initialized (after LCFG_SETUP stage) and before it starts serving + * user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exists. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are full identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Operations specific for particular lu_object. + */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It's responsibility of this method to insert lower-layer + * object(s) it create into appropriate places of object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff method is internally + * consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + unsigned ldt_device_nr; + /** + * Linkage into a global list of all device types. + * + * \see lu_device_types. + */ + struct list_head ldt_linkage; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. + */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Flags for the object layers. + */ +enum lu_object_flags { + /** + * this flags is set if lu_object_operations::loo_object_init() has + * been called for this layer. Used by lu_object_alloc(). + */ + LU_OBJECT_ALLOCATED = (1 << 0) +}; + +/** + * Common object attributes. + */ +struct lu_attr { + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + obd_time la_mtime; + /** access time in seconds since Epoch */ + obd_time la_atime; + /** change time in seconds since Epoch */ + obd_time la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. + */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Depth. Top level layer depth is 0. + */ + int lo_depth; + /** + * Flags from enum lu_object_flags. + */ + __u32 lo_flags; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link *lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. + */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Mark this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1 +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique with given lu_site. + * + * Note, that object does *not* necessary correspond to the real object in the + * persistent storage: object is an anchor for locking and method calling, so + * it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + */ +struct lu_object_header { + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +struct lu_site_bkt_data { + /** + * number of busy object on this bucket + */ + long lsb_busy; + /** + * LRU list, updated on each access to object. Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + cfs_hash_t *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; +}; + +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + cfs_hash_bd_t bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); +void lu_types_stop(void); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). + */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true of object will not be cached after last reference to it is + * released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); + +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr); + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of given compound object + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. + */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * return device operations vector for this object + */ +static const inline struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. + * Note: LOHA_EXISTS will be set once some one created the object, + * and it does not needs to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether object on the remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attr of this object. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + return o->lo_header->loh_attr; +} + +static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + return lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1) +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. + * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). + * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, set of tags in introduced. Contexts and keys are + * marked with tags. Key value are created only for context whose set + * of tags has non-empty intersection with one for key. Tags are taken + * from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. + */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Context for transaction handle + */ + LCT_TX_HANDLE = 1 << 2, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when new value is created for a + * context. Returns pointer to new value of error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when context with previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. + */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + module_t *lct_owner; + /** + * References to this key. For debugging. + */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void* mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void* data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod,type); \ + LU_KEY_FINI(mod,type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. + */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). + */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, char *page, int count); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + ssize_t lb_len; +}; + +#define DLUBUF "(%p %zu)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, int size); +void lu_buf_realloc(struct lu_buf *buf, int size); + +int lu_buf_check_and_grow(struct lu_buf *buf, int len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len); + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h new file mode 100644 index 000000000000..624c19be1524 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_ref.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to instance of struct foo, add lu_ref field to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->reference, __FUNCTION__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->reference, __FUNCTION__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + + +struct lu_ref {}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void lu_ref_print_all(void) +{ +} + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h new file mode 100644 index 000000000000..8d48cf4e27ee --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_target.h @@ -0,0 +1,91 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_LU_TARGET_H +#define _LUSTRE_LU_TARGET_H + +#include +#include + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; +}; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +void tgt_boot_epoch_update(struct lu_target *lut); +int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut, + struct obd_export *exp, __u64 transno); +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg, + struct lsd_client_data *lcd, loff_t *off, int index); +int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg, + struct lsd_client_data *lcd, loff_t *off, struct thandle *th); +int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg); +int tgt_server_data_write(const struct lu_env *env, struct lu_target *tg, + struct thandle *th); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync); +int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off); + +#endif /* __LUSTRE_LU_TARGET_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h new file mode 100644 index 000000000000..e8e0b084a6bc --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di + * Author: Nikita Danilov + * Author: Fan Yong + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Creat an iam file, but do NOT open it. + * Return 0 if success, else -1. + */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT creat it if the file doesn't exist. + * Please use iam_creat for creating the file before use iam_open. + * Return file id (fd) if success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. + */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h new file mode 100644 index 000000000000..707eb74fdf68 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h @@ -0,0 +1,43 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * NOTE: This file is DEPRECATED! Please include lustreapi.h directly + * instead of this file. This file will be removed from a future version + * of lustre! + */ + +#ifndef _LIBLUSTREAPI_H_ +#define _LIBLUSTREAPI_H_ + +#include +#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly." + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 000000000000..ad253c6deadd --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah + * Author: Andreas Dilger + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + + + +struct ll_fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_device; /* device number for this extent */ + __u32 fe_reserved[2]; +}; + +struct ll_user_fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_DIRECT. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ + + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return (sizeof(struct ll_user_fiemap) + extent_count * + sizeof(struct ll_fiemap_extent)); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return ((array_size - sizeof(struct ll_user_fiemap)) / + sizeof(struct ll_fiemap_extent)); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h new file mode 100644 index 000000000000..93a3d7db3010 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h @@ -0,0 +1,2 @@ +#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0" +#define LUSTRE_RELEASE 3.9.0_g6e62c21 diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h new file mode 100644 index 000000000000..029aa2fcbe07 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h @@ -0,0 +1,3629 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_idl.h + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types (__u8, __u16, __u32, __u64) or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures, and the + * prototypes of the swabber functions for each struct. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, + * implemented either here, inline (trivial implementations) or in + * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" + * endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#if !defined(LASSERT) && !defined(LPU64) +#include /* for LASSERT, LPUX64, etc */ +#endif + +/* Defn's shared with user-space. */ +#include + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +//#define OSC_REQUEST_PORTAL 3 +#define OSC_REPLY_PORTAL 4 +//#define OSC_BULK_PORTAL 5 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +//#define MDC_REQUEST_PORTAL 9 +#define MDC_REPLY_PORTAL 10 +//#define MDC_BULK_PORTAL 11 +#define MDS_REQUEST_PORTAL 12 +//#define MDS_REPLY_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define MDS_MDS_PORTAL 24 + +#define MGC_REPLY_PORTAL 25 +#define MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 + +#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + +typedef __u32 mdsno_t; +typedef __u64 seqno_t; +typedef __u64 obd_id; +typedef __u64 obd_seq; +typedef __s64 obd_time; +typedef __u64 obd_size; +typedef __u64 obd_off; +typedef __u64 obd_blocks; +typedef __u64 obd_valid; +typedef __u32 obd_blksize; +typedef __u32 obd_mode; +typedef __u32 obd_uid; +typedef __u32 obd_gid; +typedef __u32 obd_flag; +typedef __u32 obd_count; + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. + */ +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_index; + __u32 lsr_flags; +}; + +#define LU_SEQ_RANGE_MDT 0x0 +#define LU_SEQ_RANGE_OST 0x1 +#define LU_SEQ_RANGE_ANY 0x3 + +#define LU_SEQ_RANGE_MASK 0x3 + +static inline unsigned fld_range_type(const struct lu_seq_range *range) +{ + return range->lsr_flags & LU_SEQ_RANGE_MASK; +} + +static inline int fld_range_is_ost(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_OST; +} + +static inline int fld_range_is_mdt(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_MDT; +} + +/** + * This all range is only being used when fld client sends fld query request, + * but it does not know whether the seq is MDT or OST, so it will send req + * with ALL type, which means either seq type gotten from lookup can be + * expected. + */ +static inline unsigned fld_range_is_any(const struct lu_seq_range *range) +{ + return fld_range_type(range) == LU_SEQ_RANGE_ANY; +} + +static inline void fld_range_set_type(struct lu_seq_range *range, + unsigned flags) +{ + LASSERT(!(flags & ~LU_SEQ_RANGE_MASK)); + range->lsr_flags |= flags; +} + +static inline void fld_range_set_mdt(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_MDT); +} + +static inline void fld_range_set_ost(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_OST); +} + +static inline void fld_range_set_any(struct lu_seq_range *range) +{ + fld_range_set_type(range, LU_SEQ_RANGE_ANY); +} + +/** + * returns width of given range \a r + */ + +static inline __u64 range_space(const struct lu_seq_range *range) +{ + return range->lsr_end - range->lsr_start; +} + +/** + * initialize range to zero + */ + +static inline void range_init(struct lu_seq_range *range) +{ + range->lsr_start = range->lsr_end = range->lsr_index = 0; +} + +/** + * check if given seq id \a s is within given range \a r + */ + +static inline int range_within(const struct lu_seq_range *range, + __u64 s) +{ + return s >= range->lsr_start && s < range->lsr_end; +} + +static inline int range_is_sane(const struct lu_seq_range *range) +{ + return (range->lsr_end >= range->lsr_start); +} + +static inline int range_is_zero(const struct lu_seq_range *range) +{ + return (range->lsr_start == 0 && range->lsr_end == 0); +} + +static inline int range_is_exhausted(const struct lu_seq_range *range) + +{ + return range_space(range) == 0; +} + +/* return 0 if two range have the same location */ +static inline int range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s" + +#define PRANGE(range) \ + (range)->lsr_start, \ + (range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? "mdt" : "ost" + + +/** \defgroup lu_fid lu_fid + * @{ */ + +/** + * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat. + * Deprecated since HSM and SOM attributes are now stored in separate on-disk + * xattr. + */ +enum lma_compat { + LMAC_HSM = 0x00000001, + LMAC_SOM = 0x00000002, +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. + */ +enum lma_incompat { + LMAI_RELEASED = 0x0000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ +}; +#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT) + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, __u32 incompat); +/** + * SOM on-disk attributes stored in a separate xattr. + */ +struct som_attrs { + /** Bitfield for supported data in this structure. For future use. */ + __u32 som_compat; + + /** Incompat feature list. The supported feature mask is availabe in + * SOM_INCOMPAT_SUPP */ + __u32 som_incompat; + + /** IO Epoch SOM attributes belongs to */ + __u64 som_ioepoch; + /** total file size in objects */ + __u64 som_size; + /** total fs blocks in objects */ + __u64 som_blocks; + /** mds mount id the size is valid for */ + __u64 som_mountid; +}; +extern void lustre_som_swab(struct som_attrs *attrs); + +#define SOM_INCOMPAT_SUPP 0x0 + +/** + * HSM on-disk attributes stored in a separate xattr. + */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. */ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline obd_id fid_ver_oid(const struct lu_fid *fid) +{ + return ((__u64)fid_ver(fid) << 32 | fid_oid(fid)); +} + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. + * + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_OST_MDT1 = 3, + FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. */ + FID_SEQ_SPECIAL = 0x200000004ULL, + FID_SEQ_QUOTA = 0x200000005ULL, + FID_SEQ_QUOTA_GLB = 0x200000006ULL, + FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ + FID_SEQ_NORMAL = 0x200000400ULL, + FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL +}; + +#define OBIF_OID_MAX_BITS 32 +#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) +#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) +#define IDIF_OID_MAX_BITS 48 +#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) +#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) + +/** OID for FID_SEQ_SPECIAL */ +enum special_oid { + /* Big Filesystem Lock to serialize rename operations */ + FID_OID_SPECIAL_BFL = 1UL, +}; + +/** OID for FID_SEQ_DOT_LUSTRE */ +enum dot_lustre_oid { + FID_OID_DOT_LUSTRE = 1UL, + FID_OID_DOT_LUSTRE_OBF = 2UL, +}; + +static inline int fid_seq_is_mdt0(obd_seq seq) +{ + return (seq == FID_SEQ_OST_MDT0); +} + +static inline int fid_seq_is_mdt(const __u64 seq) +{ + return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; +}; + +static inline int fid_seq_is_echo(obd_seq seq) +{ + return (seq == FID_SEQ_ECHO); +} + +static inline int fid_is_echo(const struct lu_fid *fid) +{ + return fid_seq_is_echo(fid_seq(fid)); +} + +static inline int fid_seq_is_llog(obd_seq seq) +{ + return (seq == FID_SEQ_LLOG); +} + +static inline int fid_is_llog(const struct lu_fid *fid) +{ + /* file with OID == 1 is not llog but contains last oid */ + return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 1; +} + +static inline int fid_seq_is_rsvd(const __u64 seq) +{ + return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD); +}; + +static inline int fid_seq_is_special(const __u64 seq) +{ + return seq == FID_SEQ_SPECIAL; +}; + +static inline int fid_seq_is_local_file(const __u64 seq) +{ + return seq == FID_SEQ_LOCAL_FILE || + seq == FID_SEQ_LOCAL_NAME; +}; + +static inline int fid_seq_is_root(const __u64 seq) +{ + return seq == FID_SEQ_ROOT; +} + +static inline int fid_seq_is_dot(const __u64 seq) +{ + return seq == FID_SEQ_DOT_LUSTRE; +} + +static inline int fid_seq_is_default(const __u64 seq) +{ + return seq == FID_SEQ_LOV_DEFAULT; +} + +static inline int fid_is_mdt0(const struct lu_fid *fid) +{ + return fid_seq_is_mdt0(fid_seq(fid)); +} + +static inline void lu_root_fid(struct lu_fid *fid) +{ + fid->f_seq = FID_SEQ_ROOT; + fid->f_oid = 1; + fid->f_ver = 0; +} + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is a igif; otherwise false. + */ +static inline int fid_seq_is_igif(const __u64 seq) +{ + return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; +} + +static inline int fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq_is_igif(fid_seq(fid)); +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is a idif; otherwise false. + */ +static inline int fid_seq_is_idif(const __u64 seq) +{ + return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; +} + +static inline int fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq_is_idif(fid_seq(fid)); +} + +static inline int fid_is_local_file(const struct lu_fid *fid) +{ + return fid_seq_is_local_file(fid_seq(fid)); +} + +static inline int fid_seq_is_norm(const __u64 seq) +{ + return (seq >= FID_SEQ_NORMAL); +} + +static inline int fid_is_norm(const struct lu_fid *fid) +{ + return fid_seq_is_norm(fid_seq(fid)); +} + +/* convert an OST objid into an IDIF FID SEQ number */ +static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx) +{ + return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); +} + +/* convert a packed IDIF FID into an OST objid */ +static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver) +{ + return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; +} + +/* extract ost index from IDIF FID */ +static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) +{ + LASSERT(fid_is_idif(fid)); + return (fid_seq(fid) >> 16) & 0xffff; +} + +/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ +static inline obd_seq ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline obd_id ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid_seq(ostid))) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) */ + if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. + */ +static inline void ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (oid >= IDIF_MAX_OID) { + CERROR("Bad "LPU64" to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi.oi_id = oid; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Bad "LPU64" to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi_fid.f_oid = oid; + } +} + +static inline void ostid_inc_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) { + CERROR("Bad inc "DOSTID"\n", POSTID(oi)); + return; + } + oi->oi.oi_id++; + } else { + oi->oi_fid.f_oid++; + } +} + +static inline void ostid_dec_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) + oi->oi.oi_id--; + else + oi->oi_fid.f_oid--; +} + +/** + * Unpack an OST object id/seq (group) into a FID. This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. For reference see: + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs + */ +static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid, + __u32 ost_idx) +{ + if (ost_idx > 0xffff) { + CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid), + ost_idx); + return -EBADF; + } + + if (fid_seq_is_mdt0(ostid_seq(ostid))) { + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. */ + if (ostid_id(ostid) >= IDIF_MAX_OID) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = ostid_id(ostid); + /* in theory, not currently used */ + fid->f_ver = ostid_id(ostid) >> 48; + } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ { + /* This is either an IDIF object, which identifies objects across + * all OSTs, or a regular FID. The IDIF namespace maps legacy + * OST objects into the FID namespace. In both cases, we just + * pass the FID through, no conversion needed. */ + if (ostid->oi_fid.f_ver != 0) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + *fid = ostid->oi_fid; + } + + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid), + fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return 0; +} + +/* Check whether the fid is for LAST_ID */ +static inline int fid_is_last_id(const struct lu_fid *fid) +{ + return (fid_is_idif(fid) || fid_is_norm(fid) || fid_is_echo(fid)) && + fid_oid(fid) == 0; +} + +/** + * Get inode number from a igif. + * \param fid a igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +extern void lustre_swab_ost_id(struct ost_id *oid); + +/** + * Get inode generation from a igif. + * \param fid a igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. + */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. + */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = cpu_to_le64(fid_seq(src)); + dst->f_oid = cpu_to_le32(fid_oid(src)); + dst->f_ver = cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = le64_to_cpu(fid_seq(src)); + dst->f_oid = le32_to_cpu(fid_oid(src)); + dst->f_ver = le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = cpu_to_be64(fid_seq(src)); + dst->f_oid = cpu_to_be32(fid_oid(src)); + dst->f_ver = cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = be64_to_cpu(fid_seq(src)); + dst->f_oid = be32_to_cpu(fid_oid(src)); + dst->f_ver = be32_to_cpu(fid_ver(src)); +} + +static inline int fid_is_sane(const struct lu_fid *fid) +{ + return fid != NULL && + ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline int fid_is_zero(const struct lu_fid *fid) +{ + return fid_seq(fid) == 0 && fid_oid(fid) == 0; +} + +extern void lustre_swab_lu_fid(struct lu_fid *fid); +extern void lustre_swab_lu_seq_range(struct lu_seq_range *range); + +static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + /* Check that there is no alignment padding. */ + CLASSERT(sizeof *f0 == + sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver); + return memcmp(f0, f1, sizeof *f0) == 0; +} + +#define __diff_normalize(val0, val1) \ +({ \ + typeof(val0) __val0 = (val0); \ + typeof(val1) __val1 = (val1); \ + \ + (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \ +}) + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + return + __diff_normalize(fid_seq(f0), fid_seq(f1)) ?: + __diff_normalize(fid_oid(f0), fid_oid(f1)) ?: + __diff_normalize(fid_ver(f0), fid_ver(f1)); +} + +static inline void ostid_cpu_to_le(struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. + */ +enum lu_dirent_attrs { + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT interanl only, + * not visible to client */ + + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* The system is upgraded, has beed or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* Ignore this record, go to next directly. */ + LUDA_IGNORE = 0x0800, +}; + +#define LU_DIRENT_ATTRS_MASK 0xf800 + +/** + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + /** valid if LUDA_FID is set. */ + struct lu_fid lde_fid; + /** a unique entry identifier: a hash or an offset. */ + __u64 lde_hash; + /** total record length, including all attributes. */ + __u16 lde_reclen; + /** name length */ + __u16 lde_namelen; + /** optional variable size attributes following this entry. + * taken from enum lu_dirent_attrs. + */ + __u32 lde_attrs; + /** name is followed by the attributes indicated in ->ldp_attrs, in + * their natural order. After the last attribute, padding bytes are + * added to make ->lde_reclen a multiple of 8. + */ + char lde_name[0]; +}; + +/* + * Definitions of optional directory entry attributes formats. + * + * Individual attributes do not have their length encoded in a generic way. It + * is assumed that consumer of an attribute knows its format. This means that + * it is impossible to skip over an unknown attribute, except by skipping over all + * remaining attributes (by using ->lde_reclen), which is not too + * constraining, because new server versions will append new attributes at + * the end of an entry. + */ + +/** + * Fid directory attribute: a fid of an object referenced by the entry. This + * will be almost always requested by the client and supplied by the server. + * + * Aligned to 8 bytes. + */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline int lu_dirent_calc_size(int namelen, __u16 attr) +{ + int size; + + if (attr & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; + + return (size + 7) & ~7; +} + +static inline int lu_dirent_size(struct lu_dirent *ent) +{ + if (le16_to_cpu(ent->lde_reclen) == 0) { + return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen), + le32_to_cpu(ent->lde_attrs)); + } + return le16_to_cpu(ent->lde_reclen); +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_CACHE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ. + */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline int lustre_handle_is_used(struct lustre_handle *lh) +{ + return lh->cookie != 0ull; +} + +static inline int lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + +/* without gss, ptlrpc_body is put at the first buffer. */ +#define PTLRPC_NUM_VERSIONS 4 +#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; + char pb_jobid[JOBSTATS_JOBID_SIZE]; +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; +}; + +extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Flags that are operation-specific go in the top 16 bits. */ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. */ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated. + *We do not support JOIN FILE + *anymore, reserve this flags + *just for preventing such bit + *to be reused.*/ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. */ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +/* XXX README XXX: + * Please DO NOT add flag values here before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. */ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | \ + OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \ + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \ + OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \ + OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS) +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS) +#define ECHO_CONNECT_SUPPORTED (0) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS) + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) + +#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ + ((patch)<<8) + (fix)) +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) +#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. */ +struct obd_connect_data_v1 { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* also fix lustre_swab_connect */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ +}; + +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + + +extern void lustre_swab_connect(struct obd_connect_data *ocd); + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. + * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. + */ +typedef enum { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +typedef enum { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LAST_OPC +} ost_cmd_t; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client. + * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C, + + /* mask for local-only flag, which won't be sent over network */ + OBD_FL_LOCAL_MASK = 0xF0000000, +}; + +#define LOV_MAGIC_V1 0x0BD10BD0 +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_MAGIC_V3 0x0BD30BD0 + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. + * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + */ +#define LOV_MAGIC_V1_DEF 0x0CD10BD0 +#define LOV_MAGIC_V3_DEF 0x0CD30BD0 + +#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ +#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ +#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */ +#define LOV_PATTERN_CMOBD 0x200 + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. + */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline __u64 lmm_oi_id(struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */ + +#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_LUSTRE_PREFIX "lustre." + +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace" + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ +#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ + +/* OBD_MD_MDTIDX is used to get MDT index, but it is never been used overwire, + * and it is already obsolete since 2.3 */ +/* #define OBD_MD_MDTIDX (0x0000000800000000ULL) */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */ +#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */ +#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */ +#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss); + +extern void lustre_swab_obd_statfs (struct obd_statfs *os); + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. */ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ + +#define OBD_OBJECT_EOF 0xffffffffffffffffULL + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +#define IOOBJ_MAX_BRW_BITS 16 +#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1) +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo); + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 offset; + __u32 len; + __u32 flags; +}; + +extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. */ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + obd_time lvb_mtime; + obd_time lvb_atime; + obd_time lvb_ctime; + __u64 lvb_blocks; +}; + +extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); + +struct ost_lvb { + __u64 lvb_size; + obd_time lvb_mtime; + obd_time lvb_atime; + obd_time lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +extern void lustre_swab_ost_lvb(struct ost_lvb *lvb); + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +#define QUOTABLOCK_BITS 10 +#endif + +#ifndef QUOTABLOCK_SIZE +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +#endif + +#ifndef toqb +#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS) +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +extern void lustre_swab_obd_quotactl(struct obd_quotactl *q); + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. */ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +extern void lustre_swab_quota_body(struct quota_body *b); + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +typedef enum { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +} quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GETSTATUS = 40, + MDS_STATFS = 41, + MDS_PIN = 42, + MDS_UNPIN = 43, + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_LAST_OPC +} mds_cmd_t; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +typedef enum { + UPDATE_OBJ = 1000, + UPDATE_LAST_OPC +} update_cmd_t; + +#define UPDATE_FIRST_OPC UPDATE_OBJ + +/* + * Do not exceed 63 + */ + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, +// REINT_WRITE = 9, + REINT_MAX +} mds_reint_t, mdt_reint_t; + +extern void lustre_swab_generic_32s (__u32 *val); + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 + +/* INODE LOCK PARTS */ +#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ +#define MDS_INODELOCK_PERM 0x000010 /* for permission */ + +#define MDS_INODELOCK_MAXSHIFT 4 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) + +extern void lustre_swab_ll_fid (struct ll_fid *fid); + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). */ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +/* mdt_thread_info.mti_flags. */ +enum md_op_flags { + /* The flag indicates Size-on-MDS attributes are changed. */ + MF_SOM_CHANGE = (1 << 0), + /* Flags indicates an epoch opens or closes. */ + MF_EPOCH_OPEN = (1 << 1), + MF_EPOCH_CLOSE = (1 << 2), + MF_MDC_CANCEL_FID1 = (1 << 3), + MF_MDC_CANCEL_FID2 = (1 << 4), + MF_MDC_CANCEL_FID3 = (1 << 5), + MF_MDC_CANCEL_FID4 = (1 << 6), + /* There is a pending attribute update. */ + MF_SOM_AU = (1 << 7), + /* Cancel OST locks while getattr OST attributes. */ + MF_GETATTR_LOCK = (1 << 8), + MF_GET_MDT_IDX = (1 << 9), +}; + +#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE) + +#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1 + +/* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ +#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */ +#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */ +#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */ +#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); +} + +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; + struct lustre_handle handle; + __u64 valid; + __u64 size; /* Offset, in the case of MDS_READPAGE */ + obd_time mtime; + obd_time atime; + obd_time ctime; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ + __u64 ioepoch; + __u64 unused1; /* was "ino" until 2.4.0 */ + __u32 fsuid; + __u32 fsgid; + __u32 capability; + __u32 mode; + __u32 uid; + __u32 gid; + __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */ + __u32 rdev; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 unused2; /* was "generation" until 2.4.0 */ + __u32 suppgid; + __u32 eadatasize; + __u32 aclsize; + __u32 max_mdsize; + __u32 max_cookiesize; + __u32 uid_h; /* high 32-bits of uid, for FUID */ + __u32 gid_h; /* high 32-bits of gid, for FUID */ + __u32 padding_5; /* also fix lustre_swab_mdt_body */ + __u64 padding_6; + __u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; +}; /* 216 */ + +extern void lustre_swab_mdt_body (struct mdt_body *b); + +struct mdt_ioepoch { + struct lustre_handle handle; + __u64 ioepoch; + __u32 flags; + __u32 padding; +}; + +extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b); + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, + CFS_RMTACL_PERM = 0x08, + CFS_RMTOWN_PERM = 0x10 +}; + +/* inode access permission for remote user, the inode info are omitted, + * for client knows them. */ +struct mdt_remote_perm { + __u32 rp_uid; + __u32 rp_gid; + __u32 rp_fsuid; + __u32 rp_fsuid_h; + __u32 rp_fsgid; + __u32 rp_fsgid_h; + __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */ + __u32 rp_padding; +}; + +extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p); + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + obd_time sa_mtime; + obd_time sa_atime; + obd_time sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_padding_3; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa); + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. + */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* IO Epoch is opened on a closed file. */ +#define MDS_FMODE_EPOCH 01000000 +/* IO Epoch is opened on a file truncate. */ +#define MDS_FMODE_TRUNC 02000000 +/* Size-on-MDS Attribute Update is pending. */ +#define MDS_FMODE_SOM 04000000 + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. */ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ + +/* permission for create non-directory file */ +#define MAY_CREATE (1 << 7) +/* permission for create directory file */ +#define MAY_LINK (1 << 8) +/* permission for delete from the directory */ +#define MAY_UNLINK (1 << 9) +/* source's permission for rename */ +#define MAY_RENAME_SRC (1 << 10) +/* target's permission for rename */ +#define MAY_RENAME_TAR (1 << 11) +/* part (parent's) VTX permission check */ +#define MAY_VTX_PART (1 << 12) +/* full VTX permission check */ +#define MAY_VTX_FULL (1 << 13) +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +enum { + MDS_CHECK_SPLIT = 1 << 0, + MDS_CROSS_REF = 1 << 1, + MDS_VTX_BYPASS = 1 << 2, + MDS_PERM_BYPASS = 1 << 3, + MDS_SOM = 1 << 4, + MDS_QUOTA_IGNORE = 1 << 5, + MDS_CLOSE_CLEANUP = 1 << 6, + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ + obd_time cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc) +{ + return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32)); +} + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + obd_time lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + obd_time ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + obd_time rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + obd_time sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. + */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + obd_time rr_mtime; + obd_time rr_atime; + obd_time rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); + +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +extern void lustre_swab_lmv_desc (struct lmv_desc *ld); + +/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ +struct lmv_stripe_md { + __u32 mea_magic; + __u32 mea_count; + __u32 mea_master; + __u32 mea_padding; + char mea_pool_name[LOV_MAXPOOLNAME]; + struct lu_fid mea_ids[0]; +}; + +extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea); + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __u64 ld_default_stripe_offset; /* in bytes */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +extern void lustre_swab_lov_desc (struct lov_desc *ld); + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id); + +static inline int ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return !memcmp(res0, res1, sizeof(*res0)); +} + +/* lock types */ +typedef enum { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +} ldlm_mode_t; + +#define LCK_MODE_NUM 8 + +typedef enum { + LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +} ldlm_type_t; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->start) && (ex1->end >= ex2->end); +} + +struct ldlm_inodebits { + __u64 bits; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. */ + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_wire_policy_data_t; + +extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d); + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; +}; + +extern void lustre_swab_gl_desc(union ldlm_gl_desc *); + +struct ldlm_intent { + __u64 opc; +}; + +extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); + +struct ldlm_resource_desc { + ldlm_type_t lr_type; + __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r); + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + ldlm_mode_t l_req_mode; + ldlm_mode_t l_granted_mode; + ldlm_wire_policy_data_t l_policy_data; +}; + +extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +extern void lustre_swab_ldlm_request (struct ldlm_request *rq); + +/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available. + * Otherwise, 2 are available. */ +#define ldlm_request_bufsize(count,type) \ +({ \ + int _avail = LDLM_LOCKREQ_HANDLES; \ + _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \ + sizeof(struct ldlm_request) + \ + (count > _avail ? count - _avail : 0) * \ + sizeof(struct lustre_handle); \ +}) + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; +}; +extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. */ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; +extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u8 mcb_reserved; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; +extern void lustre_swab_mgs_config_body(struct mgs_config_body *body); + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + __u64 mcr_size; /* size of the log */ +}; +extern void lustre_swab_mgs_config_res(struct mgs_config_res *body); + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + obd_time cm_createtime; /*when this record was first created */ + obd_time cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +extern void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +/* + * Opcodes for multiple servers. + */ + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +/* catalog of log objects */ + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) + */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. + */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +}; + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +}; + +/* Where data follow just after header */ +#define REC_DATA(ptr) \ + ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) + +#define REC_DATA_LEN(rec) \ + (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + obd_id lur_oid; + obd_count lur_oseq; + obd_count lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + obd_count lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_padding; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask */ +#define CHANGELOG_DEFMASK CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG "changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +struct llog_changelog_ext_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_ext_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizezof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) +#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE) + +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, +}; + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + obd_time llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + __u32 llh_cat_idx; + /* for a catalog the first plain slot is next to it */ + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23]; + __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); + +#define LLOG_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ + llh->llh_bitmap_offset - \ + sizeof(llh->llh_tail)) * 8) + +/** log cookies are used to reference a specific log file and a record therein */ +struct llog_cookie { + struct llog_logid lgc_lgl; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* deprecated */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + obd_valid o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + obd_id o_parent_seq; + obd_size o_size; /* o_size-o_blocks == ost_lvb */ + obd_time o_mtime; + obd_time o_atime; + obd_time o_ctime; + obd_blocks o_blocks; /* brw: cli sent cached bytes */ + obd_size o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + obd_blksize o_blksize; /* optimal IO blocksize */ + obd_mode o_mode; /* brw: cli sent cache remain */ + obd_uid o_uid; + obd_gid o_gid; + obd_flag o_flags; + obd_count o_nlink; /* brw: checksum */ + obd_count o_parent_oid; + obd_count o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* brw: lock handle to prolong + * locks */ + struct llog_cookie o_lcookie; /* destroy: unlink cookie from + * MDS */ + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. + * brw: grant space consumed on + * the client for the write */ + __u64 o_padding_4; + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +static inline void lustre_set_wire_obdo(struct obdo *wobdo, struct obdo *lobdo) +{ + memcpy(wobdo, lobdo, sizeof(*lobdo)); + wobdo->o_flags &= ~OBD_FL_LOCAL_MASK; +} + +static inline void lustre_get_wire_obdo(struct obdo *lobdo, struct obdo *wobdo) +{ + obd_flag local_flags = 0; + + if (lobdo->o_valid & OBD_MD_FLFLAGS) + local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK; + + LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK)); + + memcpy(lobdo, wobdo, sizeof(*lobdo)); + if (local_flags != 0) { + lobdo->o_valid |= OBD_MD_FLFLAGS; + lobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + lobdo->o_flags |= local_flags; + } +} + +extern void lustre_swab_obdo (struct obdo *o); + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char name[8]; + struct obdo oa; + struct ll_user_fiemap fiemap; +}; + +extern void lustre_swab_ost_body (struct ost_body *b); +extern void lustre_swab_ost_last_id(obd_id *id); +extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap); + +extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); + +/* llog_swab.c */ +extern void lustre_swab_llogd_body (struct llogd_body *d); +extern void lustre_swab_llog_hdr (struct llog_log_hdr *h); +extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d); +extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec); + +struct lustre_cfg; +extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_obdo(struct obdo *oa); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. + * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; +extern void lustre_swab_idx_info(struct idx_info *ii); + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. */ + char lip_entries[0]; +}; +extern void lustre_swab_lip_header(struct lu_idxpage *lip); + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +typedef enum { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +} sec_cmd_t; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa(struct lustre_capa *c); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) +#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* MDS capability covers object capability for operations of body r/w + * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w, + * while OSS capability only covers object capability for operations of + * oss data(file content) r/w/truncate. + */ +static inline int capa_for_mds(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0; +} + +static inline int capa_for_oss(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0; +} + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + /* future use */ + __u32 padding1; + __u32 padding2; +}; + +/** Hardlink data is name and parent fid. + * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +}__attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + char gf_path[0]; +} __attribute__((packed)); + +void lustre_swab_fid2path (struct getinfo_fid2path *gf); + +enum { + LAYOUT_INTENT_ACCESS = 0, + LAYOUT_INTENT_READ = 1, + LAYOUT_INTENT_WRITE = 2, + LAYOUT_INTENT_GLIMPSE = 3, + LAYOUT_INTENT_TRUNC = 4, + LAYOUT_INTENT_RELEASE = 5, + LAYOUT_INTENT_RESTORE = 6 +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + __u64 li_start; + __u64 li_end; +}; + +void lustre_swab_layout_intent(struct layout_intent *li); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. + */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + lustre_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_current_action(struct hsm_current_action *action); +extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +extern void lustre_swab_hsm_request(struct hsm_request *hr); + +/** + * These are object update opcode under UPDATE_OBJ, which is currently + * being used by cross-ref operations between MDT. + * + * During the cross-ref operation, the Master MDT, which the client send the + * request to, will disassembly the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * Update request format + * magic: UPDATE_BUFFER_MAGIC_V1 + * Count: How many updates in the req. + * bufs[0] : following are packets of object. + * update[0]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * update[1]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * .......... + * update[7]: type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * Current 8 maxim updates per object update request. + * + ******************************************************************* + * update reply format: + * + * ur_version: UPDATE_REPLY_V1 + * ur_count: The count of the reply, which is usually equal + * to the number of updates in the request. + * ur_lens: The reply lengths of each object update. + * + * replies: 1st update reply [4bytes_ret: other body] + * 2nd update reply [4bytes_ret: other body] + * ..... + * nth update reply [4bytes_ret: other body] + * + * For each reply of the update, the format would be + * result(4 bytes):Other stuff + */ + +#define UPDATE_MAX_OPS 10 +#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001 +#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1 +#define UPDATE_BUF_COUNT 8 +enum object_update_op { + OBJ_CREATE = 1, + OBJ_DESTROY = 2, + OBJ_REF_ADD = 3, + OBJ_REF_DEL = 4, + OBJ_ATTR_SET = 5, + OBJ_ATTR_GET = 6, + OBJ_XATTR_SET = 7, + OBJ_XATTR_GET = 8, + OBJ_INDEX_LOOKUP = 9, + OBJ_INDEX_INSERT = 10, + OBJ_INDEX_DELETE = 11, + OBJ_LAST +}; + +struct update { + __u32 u_type; + __u32 u_batchid; + struct lu_fid u_fid; + __u32 u_lens[UPDATE_BUF_COUNT]; + __u32 u_bufs[0]; +}; + +struct update_buf { + __u32 ub_magic; + __u32 ub_count; + __u32 ub_bufs[0]; +}; + +#define UPDATE_REPLY_V1 0x00BD0001 +struct update_reply { + __u32 ur_version; + __u32 ur_count; + __u32 ur_lens[0]; +}; + +void lustre_swab_update_buf(struct update_buf *ub); +void lustre_swab_update_reply_buf(struct update_reply *ur); + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __packed; + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 000000000000..1c87a61a7fc1 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan Yong + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, +}; + +enum lfsck_type { + /* For MDT-OST consistency check/repair. */ + LT_LAYOUT = 0x0001, + + /* For MDT-MDT consistency check/repair. */ + LT_DNE = 0x0002, + + /* For FID-in-dirent and linkEA consistency check/repair. */ + LT_NAMESPACE = 0x0004, +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_TYPES_ALL ((__u16)(~0)) +#define LFSCK_TYPES_DEF ((__u16)0) +#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* For 64-bits aligned. */ + __u16 ls_padding; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h new file mode 100644 index 000000000000..eaa94f5cab96 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -0,0 +1,1146 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include +#include + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ +}; + +/* keep this one for compatibility */ +struct filter_fid_old { + struct lu_fid ff_parent; + __u64 ff_objid; + __u64 ff_seq; +}; + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. + */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct ostid { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID LPX64":"LPU64 +#define POSTID(oi) ostid_seq(oi), ostid_id(oi) + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* see for ioctl numberss 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */ +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */ +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */ +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long) +#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid) +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */ +#define LL_IOC_QUOTACHECK _IOW ('f', 160, int) +/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */ +#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */ +#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *) +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +#define LL_IOC_RMTACL _IOW ('f', 167, long) +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) + +/* see for ioctl numbers 177-210 */ + +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* see for ioctl numbers 221-232 */ + +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +/* Keep these for backward compartability. */ +#define LL_IOC_OBD_STATFS IOC_OBD_STATFS +#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE + + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags. + * Previously it was defined as 0100000000 and conflicts with FMODE_NONOTIFY + * which was added since kernel 2.6.36, so we redefine it as 020000000. + * To be compatible with old version's statically linked binary, finally we + * define it as (020000000 | 0100000000). + * */ +#define O_LOV_DELAY_CREATE 0120000000 + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ +#define LL_FILE_RMTACL 0x00000020 + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 + +#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */ +#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/ + +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 + +#define LOV_MAXPOOLNAME 16 +#define LOV_POOLNAMEF "%.16s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count. + * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included . */ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +/* keep this to be the same size as lov_user_ost_data_v1 */ +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +/* lum_type */ +enum { + LMV_STRIPE_TYPE = 0, + LMV_DEFAULT_TYPE = 1, +}; + +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME]; + struct lmv_user_mds_data lum_objects[0]; +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum); + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + +static inline int obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated */ +static inline char *obd_uuid2str(struct obd_uuid *uuid) +{ + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p) + *p = '\0'; +} + +/* printf display format + e.g. printf("file FID is "DFID"\n", PFID(fid)); */ +#define DFID_NOBRACE LPX64":0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) \ + (fid)->f_seq, \ + (fid)->f_oid, \ + (fid)->f_ver + +/* scanf input parse format -- strip '[' first. + e.g. sscanf(fidstr, SFID, RFID(&fid)); */ +/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX"" +liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *' +liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *' +*/ +#define SFID "0x"LPX64i":0x%x:0x%x" +#define RFID(fid) \ + &((fid)->f_seq), \ + &((fid)->f_oid), \ + &((fid)->f_ver) + + +/********* Quotas **********/ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ + +#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ + +struct if_quotacheck { + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; +}; + +/* for non-mapped uid/gid */ +#define NOBODY_UID 99 +#define NOBODY_GID 99 + +#define INVALID_ID (-1) + +enum { + RMT_LSETFACL = 1, + RMT_LGETFACL = 2, + RMT_RSETFACL = 3, + RMT_RGETFACL = 4 +}; + +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 +#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* !__KERNEL__ */ + +/* lustre volatile file support + * file name header: .^L^S^T^R:volatile" + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 +/* hdr + MDT index */ +#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:" + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ + CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_IOCTL = 12, + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_LAYOUT = 20, + CL_LAST +}; + +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "IOCTL", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "LAYOUT" + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags */ +#define CLF_VERSION 0x1000 +#define CLF_EXT_VERSION 0x2000 +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. */ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec)) + +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save + * space, only rename uses changelog_ext_rec, while others use changelog_rec to + * store records. + */ +struct changelog_ext_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags & CLF_FLAGMASK) | + CLF_EXT_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< target parent fid */ + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +#define CHANGELOG_REC_EXTENDED(rec) \ + (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION) + +static inline int changelog_rec_size(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec): + sizeof(*rec); +} + +static inline char *changelog_rec_name(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? + ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name; +} + +static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec) +{ + return rec->cr_namelen - strlen(rec->cr_name) - 1; +} + +static inline char *changelog_rec_sname(struct changelog_ext_rec *rec) +{ + return rec->cr_name + strlen(rec->cr_name) + 1; +} + +struct ioc_changelog { + __u64 icc_recno; + __u32 icc_mdtindex; + __u32 icc_id; + __u32 icc_flags; +}; + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling + version. Dirty caches are left unchanged. */ + +#ifndef offsetof +# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS below. + */ +enum hsm_states { + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-setable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + +static inline char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * This structure purpose is to be sent to user-space mainly. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. + * it is retuned to user space and send over the wire + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + +static inline char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, connot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. + * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** Compute the current length of the provided hsm_user_request. */ +static inline int hur_len(struct hsm_user_request *hur) +{ + return offsetof(struct hsm_user_request, + hur_user_item[hur->hur_request.hr_itemcount]) + + hur->hur_request.hr_data_len; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + +static inline char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid hai_fid; /* Lustre FID to operated on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/* + * helper function which print in hexa the first bytes of + * hai opaque field + * \param hai [IN] record to print + * \param buffer [OUT] output buffer + * \param len [IN] max buffer len + * \retval buffer + */ +static inline char *hai_dump_data_field(struct hsm_action_item *hai, + char *buffer, int len) +{ + int i, sz, data_len; + char *ptr; + + ptr = buffer; + sz = len; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0 ; (i < data_len) && (sz > 0) ; i++) + { + int cnt; + + cnt = snprintf(ptr, sz, "%.2X", + (unsigned char)hai->hai_data[i]); + ptr += cnt; + sz -= cnt; + } + *ptr = '\0'; + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. See hai_zero */ +} __attribute__((packed)); + +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list */ +static inline struct hsm_action_item * hai_zero(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname))); +} +/* Return pointer to next hai */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list */ +static inline int hal_size(struct hsm_action_list *hal) +{ + int i, sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname)); + hai = hai_zero(hal); + for (i = 0 ; i < hal->hal_count ; i++) { + sz += cfs_size_round(hai->hai_len); + hai = hai_next(hai); + } + return(sz); +} + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +/** + * Use by copytool during any hsm request they handled. + * This structure is initialized by llapi_hsm_copy_start() + * which is an helper over the ioctl() interface + * Store Lustre, internal use only, data. + */ +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; +}; + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h new file mode 100644 index 000000000000..63da66506639 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h @@ -0,0 +1,310 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTREAPI_H_ +#define _LUSTREAPI_H_ + +/** \defgroup llapi llapi + * + * @{ + */ + +#include + +typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args); + +/* lustreapi message severity level */ +enum llapi_message_level { + LLAPI_MSG_OFF = 0, + LLAPI_MSG_FATAL = 1, + LLAPI_MSG_ERROR = 2, + LLAPI_MSG_WARN = 3, + LLAPI_MSG_NORMAL = 4, + LLAPI_MSG_INFO = 5, + LLAPI_MSG_DEBUG = 6, + LLAPI_MSG_MAX +}; + +/* the bottom three bits reserved for llapi_message_level */ +#define LLAPI_MSG_MASK 0x00000007 +#define LLAPI_MSG_NO_ERRNO 0x00000010 + +extern void llapi_msg_set_level(int level); +extern void llapi_error(int level, int rc, char *fmt, ...); +#define llapi_err_noerrno(level, fmt, a...) \ + llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a) +extern void llapi_printf(int level, char *fmt, ...); +extern int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern); +extern int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +extern int llapi_file_create_pool(const char *name, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_poollist(const char *name); +extern int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +extern int llapi_get_poolmembers(const char *poolname, char **members, + int list_size, char *buffer, int buffer_size); +extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +#define HAVE_LLAPI_FILE_LOOKUP +extern int llapi_file_lookup(int dirfd, const char *name); + +#define VERBOSE_COUNT 0x1 +#define VERBOSE_SIZE 0x2 +#define VERBOSE_OFFSET 0x4 +#define VERBOSE_POOL 0x8 +#define VERBOSE_DETAIL 0x10 +#define VERBOSE_OBJID 0x20 +#define VERBOSE_GENERATION 0x40 +#define VERBOSE_MDTINDEX 0x80 +#define VERBOSE_ALL (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \ + VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION) + +struct find_param { + unsigned int maxdepth; + time_t atime; + time_t mtime; + time_t ctime; + int asign; /* cannot be bitfields due to using pointers to */ + int csign; /* access them during argument parsing. */ + int msign; + int type; + int size_sign:2, /* these need to be signed values */ + stripesize_sign:2, + stripecount_sign:2; + unsigned long long size; + unsigned long long size_units; + uid_t uid; + gid_t gid; + + unsigned long zeroend:1, + recursive:1, + exclude_pattern:1, + exclude_type:1, + exclude_obd:1, + exclude_mdt:1, + exclude_gid:1, + exclude_uid:1, + check_gid:1, /* group ID */ + check_uid:1, /* user ID */ + check_pool:1, /* LOV pool name */ + check_size:1, /* file size */ + exclude_pool:1, + exclude_size:1, + exclude_atime:1, + exclude_mtime:1, + exclude_ctime:1, + get_lmv:1, /* get MDT list from LMV */ + raw:1, /* do not fill in defaults */ + check_stripesize:1, /* LOV stripe size */ + exclude_stripesize:1, + check_stripecount:1, /* LOV stripe count */ + exclude_stripecount:1; + + int verbose; + int quiet; + + /* regular expression */ + char *pattern; + + char *print_fmt; + + struct obd_uuid *obduuid; + int num_obds; + int num_alloc_obds; + int obdindex; + int *obdindexes; + + struct obd_uuid *mdtuuid; + int num_mdts; + int num_alloc_mdts; + int mdtindex; + int *mdtindexes; + int file_mdtindex; + + int lumlen; + struct lov_user_mds_data *lmd; + + char poolname[LOV_MAXPOOLNAME + 1]; + + int fp_lmv_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long stripesize; + unsigned long long stripesize_units; + unsigned long long stripecount; + + /* In-process parameters. */ + unsigned long got_uuids:1, + obds_printed:1, + have_fileinfo:1; /* file attrs and LOV xattr */ + unsigned int depth; + dev_t st_dev; +}; + +extern int llapi_ostlist(char *path, struct find_param *param); +extern int llapi_uuid_match(char *real_uuid, char *search_uuid); +extern int llapi_getstripe(char *path, struct find_param *param); +extern int llapi_find(char *path, struct find_param *param); + +extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); +extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + char *poolname); +int llapi_direntry_remove(char *dname); +extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf); +extern int llapi_ping(char *obd_type, char *obd_name); +extern int llapi_target_check(int num_types, char **obd_types, char *dir); +extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +extern int llapi_is_lustre_mnttype(const char *type); +extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); +extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +extern int parse_size(char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +extern int llapi_search_mounts(const char *pathname, int index, + char *mntdir, char *fsname); +extern int llapi_search_fsname(const char *pathname, char *fsname); +extern int llapi_getname(const char *path, char *buf, size_t size); + +extern void llapi_ping_target(char *obd_type, char *obd_name, + char *obd_uuid, void *args); + +extern int llapi_search_rootpath(char *pathname, const char *fsname); + +struct mntent; +#define HAVE_LLAPI_IS_LUSTRE_MNT +extern int llapi_is_lustre_mnt(struct mntent *mnt); +extern int llapi_quotachown(char *path, int flag); +extern int llapi_quotacheck(char *mnt, int check_type); +extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk); +extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl); +extern int llapi_target_iterate(int type_num, char **obd_type, void *args, + llapi_cb_t cb); +extern int llapi_get_connect_flags(const char *mnt, __u64 *flags); +extern int llapi_lsetfacl(int argc, char *argv[]); +extern int llapi_lgetfacl(int argc, char *argv[]); +extern int llapi_rsetfacl(int argc, char *argv[]); +extern int llapi_rgetfacl(int argc, char *argv[]); +extern int llapi_cp(int argc, char *argv[]); +extern int llapi_ls(int argc, char *argv[]); +extern int llapi_fid2path(const char *device, const char *fidstr, char *path, + int pathlen, long long *recno, int *linkno); +extern int llapi_path2fid(const char *path, lustre_fid *fid); +extern int llapi_fd2fid(const int fd, lustre_fid *fid); + +extern int llapi_get_version(char *buffer, int buffer_size, char **version); +extern int llapi_get_data_version(int fd, __u64 *data_version, __u64 flags); +extern int llapi_hsm_state_get(const char *path, struct hsm_user_state *hus); +extern int llapi_hsm_state_set(const char *path, __u64 setmask, __u64 clearmask, + __u32 archive_id); + +extern int llapi_create_volatile_idx(char *directory, int idx, int mode); +static inline int llapi_create_volatile(char *directory, int mode) +{ + return llapi_create_volatile_idx(directory, -1, mode); +} + + +extern int llapi_fswap_layouts(const int fd1, const int fd2, + __u64 dv1, __u64 dv2, __u64 flags); +extern int llapi_swap_layouts(const char *path1, const char *path2, + __u64 dv1, __u64 dv2, __u64 flags); + +/* Changelog interface. priv is private state, managed internally + by these functions */ +#define CHANGELOG_FLAG_FOLLOW 0x01 /* Not yet implemented */ +#define CHANGELOG_FLAG_BLOCK 0x02 /* Blocking IO makes sense in case of + slow user parsing of the records, but it also prevents us from cleaning + up if the records are not consumed. */ + +/* Records received are in extentded format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extented format in the lustre api to ease changelog analysis. */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +extern int llapi_changelog_start(void **priv, int flags, const char *mdtname, + long long startrec); +extern int llapi_changelog_fini(void **priv); +extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech); +extern int llapi_changelog_free(struct changelog_ext_rec **rech); +/* Allow records up to endrec to be destroyed; requires registered id. */ +extern int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); + +/* HSM copytool interface. + * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv, + char *fsname, int flags, + int archive_count, int *archives); +extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv); +extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +extern int llapi_hsm_copytool_free(struct hsm_action_list **hal); +extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy, + const struct hsm_action_item *hai); +extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy, + const struct hsm_progress *hp); +extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp); +extern int llapi_hsm_import(const char *dst, int archive, struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, + char *pool_name, lustre_fid *newfid); + +/* HSM user interface */ +extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request); +extern int llapi_hsm_current_action(const char *path, + struct hsm_current_action *hca); +/** @} llapi */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h new file mode 100644 index 000000000000..5cfb87b180c3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_acl.h @@ -0,0 +1,42 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h new file mode 100644 index 000000000000..d77bffc0b59d --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_capa.h @@ -0,0 +1,305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_capa.h + * + * Author: Lai Siyao + */ + +#ifndef __LINUX_CAPA_H_ +#define __LINUX_CAPA_H_ + +/** \defgroup capa capa + * + * @{ + */ + +/* + * capability + */ +#include +#include + +#define CAPA_TIMEOUT 1800 /* sec, == 30 min */ +#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 days */ + +struct capa_hmac_alg { + const char *ha_name; + int ha_len; + int ha_keylen; +}; + +#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \ +[CAPA_HMAC_ALG_ ## type] = { \ + .ha_name = name, \ + .ha_len = len, \ + .ha_keylen = keylen, \ +} + +struct client_capa { + struct inode *inode; + struct list_head lli_list; /* link to lli_oss_capas */ +}; + +struct target_capa { + struct hlist_node c_hash; /* link to capa hash */ +}; + +struct obd_capa { + struct list_head c_list; /* link to capa_list */ + + struct lustre_capa c_capa; /* capa */ + atomic_t c_refc; /* ref count */ + cfs_time_t c_expiry; /* jiffies */ + spinlock_t c_lock; /* protect capa content */ + int c_site; + + union { + struct client_capa cli; + struct target_capa tgt; + } u; +}; + +enum { + CAPA_SITE_CLIENT = 0, + CAPA_SITE_SERVER, + CAPA_SITE_MAX +}; + +static inline struct lu_fid *capa_fid(struct lustre_capa *capa) +{ + return &capa->lc_fid; +} + +static inline __u64 capa_opc(struct lustre_capa *capa) +{ + return capa->lc_opc; +} + +static inline __u64 capa_uid(struct lustre_capa *capa) +{ + return capa->lc_uid; +} + +static inline __u64 capa_gid(struct lustre_capa *capa) +{ + return capa->lc_gid; +} + +static inline __u32 capa_flags(struct lustre_capa *capa) +{ + return capa->lc_flags & 0xffffff; +} + +static inline __u32 capa_alg(struct lustre_capa *capa) +{ + return (capa->lc_flags >> 24); +} + +static inline __u32 capa_keyid(struct lustre_capa *capa) +{ + return capa->lc_keyid; +} + +static inline __u64 capa_key_seq(struct lustre_capa_key *key) +{ + return key->lk_seq; +} + +static inline __u32 capa_key_keyid(struct lustre_capa_key *key) +{ + return key->lk_keyid; +} + +static inline __u32 capa_timeout(struct lustre_capa *capa) +{ + return capa->lc_timeout; +} + +static inline __u32 capa_expiry(struct lustre_capa *capa) +{ + return capa->lc_expiry; +} + +void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *, + const char *fmt, ... ); +#define DEBUG_CAPA(level, capa, fmt, args...) \ +do { \ + if (((level) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (level)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + _debug_capa((capa), &msgdata, fmt, ##args); \ + } \ +} while (0) + +#define DEBUG_CAPA_KEY(level, k, fmt, args...) \ +do { \ +CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n", \ + ##args, k, capa_key_seq(k), capa_key_keyid(k)); \ +} while (0) + +typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *); + +/* obdclass/capa.c */ +extern struct list_head capa_list[]; +extern spinlock_t capa_lock; +extern int capa_count[]; +extern struct kmem_cache *capa_cachep; + +struct hlist_head *init_capa_hash(void); +void cleanup_capa_hash(struct hlist_head *hash); + +struct obd_capa *capa_add(struct hlist_head *hash, + struct lustre_capa *capa); +struct obd_capa *capa_lookup(struct hlist_head *hash, + struct lustre_capa *capa, int alive); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key); +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +void capa_cpy(void *dst, struct obd_capa *ocapa); +static inline struct obd_capa *alloc_capa(int site) +{ + struct obd_capa *ocapa; + + if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER)) + return ERR_PTR(-EINVAL); + + OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep); + if (unlikely(!ocapa)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ocapa->c_list); + atomic_set(&ocapa->c_refc, 1); + spin_lock_init(&ocapa->c_lock); + ocapa->c_site = site; + if (ocapa->c_site == CAPA_SITE_CLIENT) + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + else + INIT_HLIST_NODE(&ocapa->u.tgt.c_hash); + + return ocapa; +} + +static inline struct obd_capa *capa_get(struct obd_capa *ocapa) +{ + if (!ocapa) + return NULL; + + atomic_inc(&ocapa->c_refc); + return ocapa; +} + +static inline void capa_put(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + if (atomic_read(&ocapa->c_refc) == 0) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for"); + LBUG(); + } + + if (atomic_dec_and_test(&ocapa->c_refc)) { + LASSERT(list_empty(&ocapa->c_list)); + if (ocapa->c_site == CAPA_SITE_CLIENT) { + LASSERT(list_empty(&ocapa->u.cli.lli_list)); + } else { + struct hlist_node *hnode; + + hnode = &ocapa->u.tgt.c_hash; + LASSERT(!hnode->next && !hnode->pprev); + } + OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + } +} + +static inline int open_flags_to_accmode(int flags) +{ + int mode = flags; + + if ((mode + 1) & O_ACCMODE) + mode++; + if (mode & O_TRUNC) + mode |= 2; + + return mode; +} + +static inline __u64 capa_open_opc(int mode) +{ + return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ; +} + +static inline void set_capa_expiry(struct obd_capa *ocapa) +{ + cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry, + cfs_time_current_sec()); + ocapa->c_expiry = cfs_time_add(cfs_time_current(), + cfs_time_seconds(expiry)); +} + +static inline int capa_is_expired_sec(struct lustre_capa *capa) +{ + return (capa->lc_expiry - cfs_time_current_sec() <= 0); +} + +static inline int capa_is_expired(struct obd_capa *ocapa) +{ + return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current()); +} + +static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc) +{ + return (capa_opc(capa) & opc) == opc; +} + +struct filter_capa_key { + struct list_head k_list; + struct lustre_capa_key k_key; +}; + +enum { + LC_ID_NONE = 0, + LC_ID_PLAIN = 1, + LC_ID_CONVERT = 2 +}; + +#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT) + +/** @} capa */ + +#endif /* __LINUX_CAPA_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h new file mode 100644 index 000000000000..f12429f38215 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h @@ -0,0 +1,299 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_CFG_H +#define _LUSTRE_CFG_H + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)])) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. */ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + + __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) \ + ? 0 \ + : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, + void *buf, + __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + if (bufs == NULL) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, + char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +{ + int i; + int offset; + int bufcount; + LASSERT (lcfg != NULL); + LASSERT (index >= 0); + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += cfs_size_round(lcfg->lcfg_buflens[i]); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + int i; + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) +{ + char *s; + + if (lcfg->lcfg_buflens[index] == 0) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (s == NULL) + return NULL; + + /* + * make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. + */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + int last = min((int)lcfg->lcfg_buflens[index], + cfs_size_round(lcfg->lcfg_buflens[index]) - 1); + char lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} + +static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + int i; + int len; + ENTRY; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += cfs_size_round(buflens[i]); + + RETURN(cfs_size_round(len)); +} + + +#include + +static inline struct lustre_cfg *lustre_cfg_new(int cmd, + struct lustre_cfg_bufs *bufs) +{ + struct lustre_cfg *lcfg; + char *ptr; + int i; + + ENTRY; + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + RETURN(ERR_PTR(-ENOMEM)); + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr); + } + RETURN(lcfg); +} + +static inline void lustre_cfg_free(struct lustre_cfg *lcfg) +{ + int len; + + len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); + + OBD_FREE(lcfg, len); + EXIT; + return; +} + +static inline int lustre_cfg_sanity_check(void *buf, int len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + ENTRY; + if (!lcfg) + RETURN(-EINVAL); + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + RETURN(-EINVAL); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + RETURN(-EINVAL); + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + RETURN(-EINVAL); + + /* check that the buflens are valid */ + if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + RETURN(-EINVAL); + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + RETURN(-EINVAL); + + RETURN(0); +} + +#include + +#ifndef INVALID_UID +#define INVALID_UID (-1) +#endif + +/** @} cfg */ + +#endif // _LUSTRE_CFG_H diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h new file mode 100644 index 000000000000..3d9e4462af43 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_debug.h @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_DEBUG_H +#define _LUSTRE_DEBUG_H + +/** \defgroup debug debug + * + * @{ + */ + +#include +#include + +#include + +#define ASSERT_MAX_SIZE_MB 60000ULL +#define ASSERT_PAGE_INDEX(index, OP) \ +do { if (index > ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)) { \ + CERROR("bad page index %lu > %llu\n", index, \ + ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT)); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +#define ASSERT_FILE_OFFSET(offset, OP) \ +do { if (offset > ASSERT_MAX_SIZE_MB << 20) { \ + CERROR("bad file offset %llu > %llu\n", offset, \ + ASSERT_MAX_SIZE_MB << 20); \ + libcfs_debug = ~0UL; \ + OP; \ +}} while(0) + +/* lib/debug.c */ +void dump_lniobuf(struct niobuf_local *lnb); +int dump_req(struct ptlrpc_request *req); +void dump_lsm(int level, struct lov_stripe_md *lsm); +int block_debug_setup(void *addr, int len, __u64 off, __u64 id); +int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); + +/** @} debug */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h new file mode 100644 index 000000000000..c2504c5546aa --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_disk.h @@ -0,0 +1,553 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ + +#include +#include + +/****************** on-disk files *********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. */ +#define LDD_F_ERROR 0x4000 + +/* opc for target register */ +#define LDD_F_OPC_REG 0x10000000 +#define LDD_F_OPC_UNREG 0x20000000 +#define LDD_F_OPC_READY 0x40000000 +#define LDD_F_OPC_MASK 0xf0000000 + +#define LDD_F_ONDISK_MASK (LDD_F_SV_TYPE_MASK) + +#define LDD_F_MASK 0xFFFF + +enum ldd_mount_type { + LDD_MT_EXT3 = 0, + LDD_MT_LDISKFS, + LDD_MT_SMFS, + LDD_MT_REISERFS, + LDD_MT_LDISKFS2, + LDD_MT_ZFS, + LDD_MT_LAST +}; + +static inline char *mt_str(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "ext3", + "ldiskfs", + "smfs", + "reiserfs", + "ldiskfs2", + "zfs", + }; + return mount_type_string[mt]; +} + +static inline char *mt_type(enum ldd_mount_type mt) +{ + static char *mount_type_string[] = { + "osd-ldiskfs", + "osd-ldiskfs", + "osd-smfs", + "osd-reiserfs", + "osd-ldiskfs", + "osd-zfs", + }; + return mount_type_string[mt]; +} + +#define LDD_INCOMPAT_SUPP 0 +#define LDD_ROCOMPAT_SUPP 0 + +#define LDD_MAGIC 0x1dd00001 + +/* On-disk configuration file. In host-endian order. */ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ + __u32 ldd_feature_incompat;/* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + svname */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + MTI_NAME_MAXLEN */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + +/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */ +/*1024*/__u8 ldd_padding[4096 - 1024]; +/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ +/*8192*/char ldd_params[4096]; /* key=value pairs */ +}; + + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/* Make the mdt/ost server obd name based on the filesystem name */ +static inline int server_make_name(__u32 flags, __u16 index, char *fs, + char *name) +{ + if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { + if (!(flags & LDD_F_SV_ALL)) + sprintf(name, "%.8s%c%s%04x", fs, + (flags & LDD_F_VIRGIN) ? ':' : + ((flags & LDD_F_WRITECONF) ? '=' : '-'), + (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST", + index); + } else if (flags & LDD_F_SV_TYPE_MGS) { + sprintf(name, "MGS"); + } else { + CERROR("unknown server type %#x\n", flags); + return 1; + } + return 0; +} + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + everything as string options */ + +#define LMD_MAGIC 0xbdacbd03 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + __u32 lmd_magic; + __u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + _device_ mount options) */ + char *lmd_params; /* lustre params */ + __u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + + +/****************** last_rcvd file *********************/ + +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 +#define lr_epoch(a) ((a) >> LR_EPOCH_BITS) +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16 + * bits are now used to store a generation. Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 + +/* Data stored per server at the head of the last_rcvd file. In le32 order. + This should be common to filter_internal.h, lustre_mds.h */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /** orphans handling for delayed export rely on that */ + __u32 lcd_first_epoch; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* bug20354: the lcd_uuid for export of clients may be wrong */ +static inline void check_lcd(char *obd_name, int index, + struct lsd_client_data *lcd) +{ + int length = sizeof(lcd->lcd_uuid); + if (strnlen((char*)lcd->lcd_uuid, length) == length) { + lcd->lcd_uuid[length - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports" + "stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch); +} + +static inline __u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline __u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? + lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** superblock additional info *********************/ + +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + struct vfsmount *lsi_srv_mnt; /* the one server mount */ + atomic_t lsi_mounts; /* references to the srv_mnt */ + char lsi_svname[MTI_NAME_MAXLEN]; + char lsi_osd_obdname[64]; + char lsi_osd_uuid[64]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#define LSI_BDI_INITIALIZED 0x00400000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags) +#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev) + + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct vfsmount *lmi_mnt; + struct list_head lmi_list_chain; +}; + +/* on-disk structure describing local object OIDs storage + * the structure to be used with any sequence managed by + * local object library */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee + +/****************** prototypes *********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +int server_name2index(const char *svname, __u32 *idx, const char **endptr); +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); + +/** @} disk */ + +#endif // _LUSTRE_DISK_H diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h new file mode 100644 index 000000000000..f6eaed810621 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h @@ -0,0 +1,1668 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include + +#include +#include +#include +#include +#include /* for interval_node{}, ldlm_extent */ +#include + +struct obd_ops; +struct obd_device; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000)) +#define LDLM_CTIME_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +typedef enum { + ELDLM_OK = 0, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401 +} ldlm_error_t; + +/** + * LDLM namespace type. + * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +typedef enum { + LDLM_NAMESPACE_SERVER = 1 << 0, + LDLM_NAMESPACE_CLIENT = 1 << 1 +} ldlm_side_t; + +/** + * Declaration of flags sent through the wire. + **/ +#define LDLM_FL_LOCK_CHANGED 0x000001 /* extent, mode, or resource changed */ + +/** + * If the server returns one of these flags, then the lock was put on that list. + * If the client sends one of these flags (during recovery ONLY!), it wants the + * lock added to the specified list, no questions asked. + */ +#define LDLM_FL_BLOCK_GRANTED 0x000002 +#define LDLM_FL_BLOCK_CONV 0x000004 +#define LDLM_FL_BLOCK_WAIT 0x000008 + +/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */ + +#define LDLM_FL_AST_SENT 0x000020 /* blocking or cancel packet was + * queued for sending. */ +/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040 moved to non-wire flags */ +/* Used to be LDLM_FL_CANCEL 0x000080 moved to non-wire flags */ + +/** + * Lock is being replayed. This could probably be implied by the fact that one + * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. + */ +#define LDLM_FL_REPLAY 0x000100 + +#define LDLM_FL_INTENT_ONLY 0x000200 /* Don't grant lock, just do intent. */ + +/* Used to be LDLM_FL_LOCAL_ONLY 0x000400 moved to non-wire flags */ +/* Used to be LDLM_FL_FAILED 0x000800 moved to non-wire flags */ + +#define LDLM_FL_HAS_INTENT 0x001000 /* lock request has intent */ + +/* Used to be LDLM_FL_CANCELING 0x002000 moved to non-wire flags */ +/* Used to be LDLM_FL_LOCAL 0x004000 moved to non-wire flags */ + +#define LDLM_FL_DISCARD_DATA 0x010000 /* discard (no writeback) on cancel */ + +#define LDLM_FL_NO_TIMEOUT 0x020000 /* Blocked by group lock - wait + * indefinitely */ + +/** file & record locking */ +#define LDLM_FL_BLOCK_NOWAIT 0x040000 /* Server told not to wait if blocked. + * For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_TEST_LOCK 0x080000 // return blocking lock + +/* Used to be LDLM_FL_LVB_READY 0x100000 moved to non-wire flags */ +/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */ +/* Used to be LDLM_FL_NO_LRU 0x400000 moved to non-wire flags */ + +/* Immediatelly cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This is + * for clients (like liblustre) that cannot be expected to reliably response + * to blocking AST. */ +#define LDLM_FL_CANCEL_ON_BLOCK 0x800000 + +/* Flags flags inherited from parent lock when doing intents. */ +#define LDLM_INHERIT_FLAGS (LDLM_FL_CANCEL_ON_BLOCK) + +/* Used to be LDLM_FL_CP_REQD 0x1000000 moved to non-wire flags */ +/* Used to be LDLM_FL_CLEANED 0x2000000 moved to non-wire flags */ +/* Used to be LDLM_FL_ATOMIC_CB 0x4000000 moved to non-wire flags */ +/* Used to be LDLM_FL_BL_AST 0x10000000 moved to non-wire flags */ +/* Used to be LDLM_FL_BL_DONE 0x20000000 moved to non-wire flags */ + +/* measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x40000000 + +/* These are flags that are mapped into the flags and ASTs of blocking locks */ +#define LDLM_AST_DISCARD_DATA 0x80000000 /* Add FL_DISCARD to blocking ASTs */ + +/* Flags sent in AST lock_flags to be mapped into the receiving lock. */ +#define LDLM_AST_FLAGS (LDLM_FL_DISCARD_DATA) + +/* + * -------------------------------------------------------------------------- + * NOTE! Starting from this point, that is, LDLM_FL_* flags with values above + * 0x80000000 will not be sent over the wire. + * -------------------------------------------------------------------------- + */ + +/** + * Declaration of flags not sent through the wire. + **/ + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep + * emulation + race with upcoming bl_ast. + */ +#define LDLM_FL_FAIL_LOC 0x100000000ULL + +/** + * Used while processing the unused list to know that we have already + * handled this lock and decided to skip it. + */ +#define LDLM_FL_SKIPPED 0x200000000ULL +/* this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x400000000ULL +/* not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x800000000ULL +/* cancellation callback already run */ +#define LDLM_FL_CANCEL 0x1000000000ULL +#define LDLM_FL_LOCAL_ONLY 0x2000000000ULL +/* don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x4000000000ULL +/* lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x8000000000ULL +/* local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x10000000000ULL +/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that + * the LVB filling happens _after_ the lock has been granted, so another thread + * can match it before the LVB has been updated. As a dirty hack, we set + * LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and callers + * must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, which can + * be replaced with a LVB-aware wrapping function for OSC locks. That change is + * pretty high-risk, though, and would need a lot more testing. */ +#define LDLM_FL_LVB_READY 0x20000000000ULL +/* A lock contributes to the known minimum size (KMS) calculation until it has + * finished the part of its cancelation that performs write back on its dirty + * pages. It can remain on the granted list during this whole time. Threads + * racing to update the KMS after performing their writeback need to know to + * exclude each other's locks from the calculation as they walk the granted + * list. */ +#define LDLM_FL_KMS_IGNORE 0x40000000000ULL +/* completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x80000000000ULL +/* cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x100000000000ULL +/* optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x200000000000ULL + +/* It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting + * locks to this client for the first operation, whereas the second + * operation has canceled this lock and is waiting for rpc_lock which is + * taken by the first operation. LDLM_FL_BL_AST is set by + * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel + * (ELC) code from cancelling it. + * + * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock + * cache is dropped to let ldlm_callback_handler() return EINVAL to the + * server. It is used when ELC RPC is already prepared and is waiting + * for rpc_lock, too late to send a separate CANCEL RPC. */ +#define LDLM_FL_BL_AST 0x400000000000ULL +#define LDLM_FL_BL_DONE 0x800000000000ULL +/* Don't put lock into the LRU list, so that it is not canceled due to aging. + * Used by MGC locks, they are cancelled only at unmount or by callback. */ +#define LDLM_FL_NO_LRU 0x1000000000000ULL + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. + * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants + * an inodebit lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + *
+ *       NL  CR  CW  PR  PW  EX
+ *  NL    1   1   1   1   1   1
+ *  CR    1   1   1   1   1   0
+ *  CW    1   1   1   0   0   0
+ *  PR    1   1   0   1   0   0
+ *  PW    1   1   0   0   0   0
+ *  EX    1   0   0   0   0   0
+ * 
+ */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern ldlm_mode_t lck_compat_array[]; + +static inline void lockmode_verify(ldlm_mode_t mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode) +{ + return (lck_compat_array[exist_mode] & new_mode); +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? + - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool proc directory. */ + proc_dir_entry_t *pl_proc_dir; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in in pool, both, client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in */ + atomic_t pl_granted; + /** Grant rate per T. */ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time_t pl_recalc_time; + /** Recalculation period for pool. */ + time_t pl_recalc_period; + /** Recalculation and shrink operations. */ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; +}; + +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, ldlm_mode_t mode, __u64 flags, + void *data); + +typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. + * Greedy means release cached locks aggressively + */ +typedef enum { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +} ldlm_appetite_t; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +typedef enum { + /** invalide type */ + LDLM_NS_TYPE_UNKNOWN = 0, + /** mdc namespace */ + LDLM_NS_TYPE_MDC, + /** mds namespace */ + LDLM_NS_TYPE_MDT, + /** osc namespace */ + LDLM_NS_TYPE_OSC, + /** ost namespace */ + LDLM_NS_TYPE_OST, + /** mgc namespace */ + LDLM_NS_TYPE_MGC, + /** mgs namespace */ + LDLM_NS_TYPE_MGT, +} ldlm_ns_type_t; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client. The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. + */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + ldlm_side_t ns_client; + + /** Resource hash table for namespace. */ + cfs_hash_t *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. + */ + unsigned int ns_max_unused; + /** Maximum allowed age (last used time) for locks in the LRU */ + unsigned int ns_max_age; + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + unsigned int ns_ctime_age_limit; + + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + cfs_time_t ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + ldlm_appetite_t ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + unsigned ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. + * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. + */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** Callback to cancel locks before replaying it during recovery. */ + ldlm_cancel_for_recovery ns_cancel_for_recovery; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_for_recovery arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel_for_recovery = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. */ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); +/** Type for weight callback function of a lock. */ +typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ +}; + +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + ldlm_mode_t lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +typedef enum { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. */ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST + * in the same RPC */ +} ldlm_cancel_flags_t; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + /* Protected by the hash lock */ + __u32 blocking_refs; + __u32 pid; +}; + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_policy_data_t; + +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. + */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted, waiting or converting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Tree node for ldlm_extent. + */ + struct ldlm_interval *l_tree_node; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + ldlm_mode_t l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + ldlm_mode_t l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. + * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** XXX apparently unused "weight" handler. To be removed? */ + ldlm_weigh_callback l_weigh_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. + */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + ldlm_policy_data_t l_policy_data; + + /** + * Lock state flags. + * Like whenever we receive any blocking requests for this lock, etc. + * Protected by lr_lock. + */ + __u64 l_flags; + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Seconds. It will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send blocking AST. + */ + cfs_time_t l_last_activity; + + /** + * Time last used by e.g. being matched by lock match. + * Jiffies. Should be converted to time if needed. + */ + cfs_time_t l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + unsigned int l_failed:1, + /** + * Set for locks that were removed from class hash table and will be + * destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. + */ + l_destroyed:1, + /* + * it's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. + * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. + */ + l_res_locked:1, + /* + * It's set once we call ldlm_add_waiting_lock_res_locked() + * to start the lock-timeout timer and it will never be reset. + * + * Protected by lock_res_and_lock(). + */ + l_waited:1, + /** Flag whether this is a server namespace lock. */ + l_ns_srv:1; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_thread.elt_expired_locks for further processing. + * Protected by elt_lock. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + cfs_time_t l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. \see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. + * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /* XXX No longer needed? Remove ASAP */ + ldlm_mode_t lr_most_restr; + + /** Type of locks this resource can hold. Only one type per resource. */ + ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + struct mutex lr_lvb_mutex; + int lr_lvb_len; + /** protected by lr_lock */ + void *lr_lvb_data; + + /** When the resource was considered as contended. */ + cfs_time_t lr_contention_time; + /** List of references to this resource. For debugging. */ + struct lu_ref lr_reference; + + struct inode *lr_lvb_inode; +}; + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_rs_hash->hs_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL) + return ns->ns_lvbo->lvbo_init(res); + + return 0; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + __u32 ei_type; /** Type of the lock being enqueued. */ + __u32 ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + void *ei_cb_wg; /** lock weigh callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ +}; + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern char *ldlm_it2str(int it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static cfs_debug_limit_state_t _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. + * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; + ldlm_weigh_callback lcs_weigh; +}; + +/* ldlm_lockd.c */ +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(struct lustre_handle *, void *); + +/** + * Obtain a lock reference by its handle. + */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from reqest \a r + */ +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *r, int increase) +{ + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r, + increase); + } + return 0; +} + +int ldlm_error2errno(ldlm_error_t error); +ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). + */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, ldlm_type_t type, + ldlm_policy_data_t *, ldlm_mode_t mode, + struct lustre_handle *, int unref); +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, struct lustre_handle *); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c */ +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, ldlm_appetite_t apt, + ldlm_ns_type_t ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client); +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); +int ldlm_proc_setup(void); +#ifdef LPROCFS +void ldlm_proc_cleanup(void); +#else +static inline void ldlm_proc_cleanup(void) {} +#endif + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + ldlm_type_t type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(ldlm_side_t client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. + * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, + void *data, __u32 data_len); +int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_cancel_flags_t flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, ldlm_cancel_flags_t flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, int lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags); +/** @} ldlm_cli_api */ + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. */ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + + +/* ioctls for trying requests */ +#define IOC_LDLM_TYPE 'f' +#define IOC_LDLM_MIN_NR 40 + +#define IOC_LDLM_TEST _IOWR('f', 40, long) +#define IOC_LDLM_DUMP _IOWR('f', 41, long) +#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long) +#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) +#define IOC_LDLM_MAX_NR 43 + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. */ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + LASSERT(spin_is_locked(&res->lr_lock)); +} + +struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. + * @{ + */ +void ldlm_pools_recalc(ldlm_side_t client); +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +#endif +/** @} LDLM */ diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h new file mode 100644 index 000000000000..b94f76a3301b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL + +#include + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + + +extern ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size); +extern int +lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size, + posix_acl_xattr_header **out); +extern void +lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size); +extern void +lustre_ext_acl_xattr_free(ext_acl_xattr_header *header); +extern int +lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out); +extern ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header); + +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h new file mode 100644 index 000000000000..d61c020a4643 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_export.h @@ -0,0 +1,389 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include +#include + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects led_lcd below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ + /** Bitmask of all ibit locks this MDT understands */ + __u64 med_ibits_known; + struct mutex med_idmap_mutex; + struct lustre_idmap_table *med_idmap; +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific import data */ +struct filter_export_data { + struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ + long fed_dirty; /* in bytes */ + long fed_grant; /* in bytes */ + struct list_head fed_mod_list; /* files being modified */ + int fed_mod_count;/* items in fed_writing list */ + long fed_pending; /* bytes just being written */ + __u32 fed_group; + __u8 fed_pagesize; /* log2 of client page size */ +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. + * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while(0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while(0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. + * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processes */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protect its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + struct lprocfs_stats *exp_md_stats; + /** Active connetion */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last succesful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + cfs_hash_t *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. + */ + cfs_hash_t *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + cfs_time_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* client timed out and tried to reconnect, + * but couldn't because of active rpcs */ + exp_abort_active_req:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. */ + exp_need_mne_swab:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + cfs_time_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) +{ + LASSERT(exp->exp_delayed); + return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), + cfs_time_current_sec()); +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_rmtclient(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT); +} + +static inline int client_is_remote(struct obd_export *exp) +{ + struct obd_import *imp = class_exp2cliimp(exp); + + return !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_RMT_CLIENT); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_som(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h new file mode 100644 index 000000000000..acaa1c478bba --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fid.h @@ -0,0 +1,761 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets + */ + +#ifndef __LINUX_FID_H +#define __LINUX_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. + * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure IDIFs there is only a single resource name for any + * object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. + */ + +#include +#include +#include +#include +#include + + +struct lu_site; +struct lu_context; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; + +enum { + /* + * This is how may metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. + */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + /* These two definitions are obsolete + * OFD_GROUP0_LAST_OID = 20UL, + * OFD_GROUP4K_LAST_OID = 20UL+4096, + */ + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == 1)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + (fid_oid(fid) == ACCT_USER_OID || + fid_oid(fid) == ACCT_GROUP_OID); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. */ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_is_dot_lustre(fid); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, 0); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), LPX64"\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequeces. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_seq_range lcs_space; + + /* Seq related proc */ + proc_dir_entry_t *lcs_proc_dir; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[80]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related proc */ + proc_dir_entry_t *lss_proc_dir; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interafce to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lss_name[80]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. + */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +int seq_query(struct com_thread_info *info); +int seq_handle(struct ptlrpc_request *req); + +/* Server methods */ +int seq_server_init(struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss, + const struct lu_env *env); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(struct lu_server_seq *seq, + struct lu_client_seq *cli, + const struct lu_env *env); + +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + seqno_t *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. + * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline struct ldlm_res_id * +fid_build_reg_res_name(const struct lu_fid *f, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f); + return name; +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. + */ +static inline struct ldlm_res_id * +fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); + return res; +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_quota_resid(struct ldlm_res_id *res, + struct lu_fid *glb_fid, + union lquota_id *qid) +{ + glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF]; + glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +/* + * Return true if resource is for object identified by fid. + */ +static inline int fid_res_name_eq(const struct lu_fid *f, + const struct ldlm_res_id *name) +{ + return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f); +} + +/* reverse function of fid_build_reg_res_name() */ +static inline void fid_build_from_res_name(struct lu_fid *f, + const struct ldlm_res_id *name) +{ + fid_zero(f); + f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF]; + f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff; + f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32; + LASSERT(fid_res_name_eq(f, name)); +} + +static inline struct ldlm_res_id * +fid_build_pdo_res_name(const struct lu_fid *f, + unsigned int hash, + struct ldlm_res_id *name) +{ + fid_build_reg_res_name(f, name); + name->name[LUSTRE_RES_ID_HSH_OFF] = hash; + return name; +} + +/** + * Build DLM resource name from object id & seq, which will be removed + * finally, when we replace ost_id with FID in data stack. + * + * Currently, resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just oposite with Metatdata + * resid, where, res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unifiy the resid identification, we will reverse the data + * resid to keep it same with Metadata resid, i.e. + * + * For resid from the old client, + * res[0] = objid, res[1] = 0, still keep the original order, + * for compatiblity. + * + * For new resid + * res will be built from normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. + */ +static inline void ostid_build_res_name(struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name((struct lu_fid *)oi, name); + } +} + +static inline void ostid_res_name_to_id(struct ost_id *oi, + struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) { + /* old resid */ + ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + } else { + /* new resid */ + fid_build_from_res_name((struct lu_fid *)oi, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group. + */ +static inline int ostid_res_name_eq(struct ost_id *oi, + struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort, probably the + * correct way would be turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + if (fid_to_ostid(fid, &oi) != 0) + return; + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + ostid_to_fid(fid, &oi, 0); + } else { + /* new resid */ + fid_build_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. + */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + RETURN(ino); + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + RETURN(ino ? ino : fid_oid(fid)); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return cfs_hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + RETURN(ino); + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. */ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + RETURN(ino ? ino : fid_oid(fid)); +} + +static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff */ +static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +/** @} fid */ + +#endif /* __LINUX_FID_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h new file mode 100644 index 000000000000..11e034a65b17 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fld.h @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include +#include +#include + +#include + +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; + +extern const struct dt_index_features fld_index_features; +extern const char fld_index_name[]; + +/* + * FLD (Fid Location Database) interface. + */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir proc entry. */ + proc_dir_entry_t *lsf_proc_dir; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[80]; + +}; + +struct lu_client_fld { + /** + * Client side proc entry. */ + proc_dir_entry_t *lcf_proc_dir; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to chose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld proc entry name. */ + char lcf_name[80]; + + const struct lu_context *lcf_ctx; + + int lcf_flags; +}; + +/** + * number of blocks to reserve for particular operations. Should be function of + * ... something. Stub for now. + */ +enum { + /* one insert operation can involve two delete and one insert */ + FLD_TXN_INDEX_INSERT_CREDITS = 60, + FLD_TXN_INDEX_DELETE_CREDITS = 20, +}; + +int fld_query(struct com_thread_info *info); + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int mds_node_id, + int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + struct lu_seq_range *new, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + seqno_t seq, struct lu_seq_range *range); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, + seqno_t seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_proc_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h new file mode 100644 index 000000000000..9dcc332cb2f3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fsfilt.h + * + * Filesystem interface helper. + */ + +#ifndef _LUSTRE_FSFILT_H +#define _LUSTRE_FSFILT_H + +#include + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h new file mode 100644 index 000000000000..105f6d61eef0 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_ha.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +int ptlrpc_check_suspend(void); +void ptlrpc_activate_timeouts(struct obd_import *imp); +void ptlrpc_deactivate_timeouts(struct obd_import *imp); + +/** @} ha */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h new file mode 100644 index 000000000000..fcd40f33426a --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_handles.h @@ -0,0 +1,93 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include + +#include + + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. -jxiong */ + cfs_rcu_head_t h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; +#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu) + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie); +void class_handle_free_cb(cfs_rcu_head_t *); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h new file mode 100644 index 000000000000..084bdd6ab4db --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_idmap.h @@ -0,0 +1,104 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_CACHE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +enum { + CFS_IC_NOTHING = 0, /* convert nothing */ + CFS_IC_ALL = 1, /* convert all items */ + CFS_IC_MAPPED = 2, /* convert mapped uid/gid */ + CFS_IC_UNMAPPED = 3 /* convert unmapped uid/gid */ +}; + +#define CFS_IDMAP_NOTFOUND (-1) + +#define CFS_IDMAP_HASHSIZE 32 + +enum lustre_idmap_idx { + RMT_UIDMAP_IDX, + LCL_UIDMAP_IDX, + RMT_GIDMAP_IDX, + LCL_GIDMAP_IDX, + CFS_IDMAP_N_HASHES +}; + +struct lustre_idmap_table { + spinlock_t lit_lock; + struct list_head lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE]; +}; + +struct lu_ucred; + +extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist); +extern void lustre_groups_sort(group_info_t *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +extern int lustre_idmap_add(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid); +extern int lustre_idmap_del(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid); +extern int lustre_idmap_lookup_uid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, uid_t uid); +extern int lustre_idmap_lookup_gid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, gid_t gid); +extern struct lustre_idmap_table *lustre_idmap_init(void); +extern void lustre_idmap_fini(struct lustre_idmap_table *t); + +/** @} idmap */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h new file mode 100644 index 000000000000..3a5dd6a94c08 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_import.h @@ -0,0 +1,367 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. + * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include +#include + + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, +}; + +/** Returns test string representation of numeric import state \a state */ +static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char* import_state_names[] = { + "", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; +} + +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit jiffies) of last connection attempt on this connection + */ + __u64 oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time_t ish_time; +}; + +/** + * Defintion of PortalRPC import structure. + * Imports are representing client-side view to remote target. + */ +struct obd_import { + /** Local handle (== id) for this import. */ + struct portals_handle imp_handle; + /** Reference counter */ + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** List element for linking into chain for destruction */ + struct list_head imp_zombie_chain; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. + * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some seciruty-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + struct mutex imp_sec_mutex; + cfs_time_t imp_sec_expire; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Numbner of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. Incremented on every reconnect */ + int imp_generation; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last tranno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + cfs_time_t imp_next_ping; + /** When we last succesfully connected. time in 64bit jiffies */ + __u64 imp_last_success_conn; + + /** List of all possible connection for import. */ + struct list_head imp_conn_list; + /** + * Current connection. \a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /* flags */ + unsigned long imp_no_timeout:1, /* timeouts are disabled */ + imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* VBR: if gap was found then no lock replays + */ + imp_no_lock_replay:1, + /* recovery by versions was failed */ + imp_vbr_failed:1, + /* force an immidiate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. */ + imp_no_pinger_recover:1, + /* need IR MNE swab */ + imp_need_mne_swab:1, + /* import must be reconnected instead of + * chouse new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + __u32 imp_msghdr_flags; /* adjusted based on server capability */ + + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ +}; + +typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, + int event, void *event_arg, void *cb_data); + +/** + * Structure for import observer. + * It is possible to register "observer" on an import and every time + * something happens to an import (like connect/evict/disconnect) + * obderver will get its callback called with event type + */ +struct obd_import_observer { + struct list_head oio_chain; + obd_import_callback oio_cb; + void *oio_cb_data; +}; + +void class_observe_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_notify_import_observers(struct obd_import *imp, int event, + void *event_arg); + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout: e=4/5(t-5) */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) { + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = cfs_time_current_sec(); +} +static inline void at_init(struct adaptive_timeout *at, int val, int flags) { + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) { + return (at->at_current > at_min) ? at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h new file mode 100644 index 000000000000..bdfc5391c6d2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -0,0 +1,667 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#include +#include +#include +#include +#include + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include +#include +#include + + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + obd_count keylen, void *key, + obd_count vallen, void *val, + struct ptlrpc_request_set *set); + +#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* client.c */ + +int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg* lcfg); +struct client_obd *client_conn2cli(struct lustre_handle *conn); + +struct md_open_data; +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + __u32 och_magic; + int och_flags; +}; +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +/* statfs_pack.c */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* l_lock.c */ +struct lustre_lock { + int l_depth; + task_t *l_owner; + struct semaphore l_sem; + spinlock_t l_spin; +}; + +void l_lock_init(struct lustre_lock *); +void l_lock(struct lustre_lock *); +void l_unlock(struct lustre_lock *); +int l_has_lock(struct lustre_lock *); + +/* + * For md echo client + */ +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +/* + * OBD IOCTLS + */ +#define OBD_IOCTL_VERSION 0x00010004 + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + obd_size ioc_count; + obd_off ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char *ioc_pbuf1; + __u32 ioc_plen2; + char *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline int obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + int len = cfs_size_round(sizeof(struct obd_ioctl_data)); + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + len += cfs_size_round(data->ioc_inllen3); + len += cfs_size_round(data->ioc_inllen4); + return len; +} + + +static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen3 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen4 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf3 && !data->ioc_inllen3) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf4 && !data->ioc_inllen4) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + return 0; +} + + +#include + +/* function defined in lustre/obdclass//-module.c */ +int obd_ioctl_getdata(char **buf, int *len, void *arg); +int obd_ioctl_popdata(void *arg, void *data, int len); + +static inline void obd_ioctl_freedata(char *buf, int len) +{ + ENTRY; + + OBD_FREE_LARGE(buf, len); + EXIT; + return; +} + +/* + * BSD ioctl description: + * #define IOC_V1 _IOR(g, n1, long) + * #define IOC_V2 _IOW(g, n2, long) + * + * ioctl(f, IOC_V1, arg); + * arg will be treated as a long value, + * + * ioctl(f, IOC_V2, arg) + * arg will be treated as a pointer, bsd will call + * copyin(buf, arg, sizeof(long)) + * + * To make BSD ioctl handles argument correctly and simplely, + * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data + * for us. Does this change affect Linux? (XXX Liang) + */ +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) +#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) +#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME + +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) +#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) + +#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE) +/* see also for ioctls 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */ +#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */ +#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */ +#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE) +/* see for ioctls 157-159 */ +/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */ +#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int) +/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */ +#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* see also for ioctls 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data) +#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO is deprecated */ +#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) + +#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* defines ioctl number 218-219 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) + +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) + +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) + +/* XXX _IOWR('f', 250, long) has been defined in + * libcfs/include/libcfs/libcfs_private.h for debug, don't use it + */ + +/* Until such time as we get_info the per-stripe maximum from the OST, + * we define this to be 2T - 4k, which is the ext3 maxbytes. */ +#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL + +/* Special values for remove LOV EA from disk */ +#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \ + offset == (typeof(offset))(-1)) + +/* #define POISON_BULK 0 */ + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIKGILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. + * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. + * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. + * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessary takes + * wait-queue spin-lock before returning, and, hence, following idiom is safe + * ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. + * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + cfs_duration_t lwi_timeout; + cfs_duration_t lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. + */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_t __wait; \ + cfs_duration_t __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry_current(&__wait); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). */ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + unsigned __wstate; \ + \ + __wstate = info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr) ? \ + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \ + \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + waitq_wait(&__wait, __wstate); \ + } else { \ + cfs_duration_t interval = info->lwi_interval? \ + min_t(cfs_duration_t, \ + info->lwi_interval,__timeout):\ + __timeout; \ + cfs_duration_t remaining = waitq_timedwait(&__wait,\ + __wstate, \ + interval); \ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. */ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (cfs_signal_pending()) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. */ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +#define LIBLUSTRE_CLIENT (0) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h new file mode 100644 index 000000000000..5790be913bf6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + * Use is subject to license terms. + * + * Author: di wang + */ + +struct linkea_data { + /** + * Buffer to keep link EA body. + */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its lenght in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +#define LINKEA_NEXT_ENTRY(ldata) \ + (struct link_ea_entry *)((char *)ldata.ld_lee + ldata.ld_reclen) + +#define LINKEA_FIRST_ENTRY(ldata) \ + (struct link_ea_entry *)(ldata.ld_leh + 1) diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h new file mode 100644 index 000000000000..25f8bfaccef3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_lite.h @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LL_H +#define _LL_H + +/** \defgroup lite lite + * + * @{ + */ + +#include + +#include +#include +#include +#include +#include + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS (22) +#define LL_MAX_BLKSIZE (1UL< + + +struct lustre_rw_params { + int lrp_lock_mode; + ldlm_policy_data_t lrp_policy; + obd_flag lrp_brw_flags; + int lrp_ast_flags; +}; + +/* + * XXX nikita: this function lives in the header because it is used by both + * llite kernel module and liblustre library, and there is no (?) better place + * to put it in. + */ +static inline void lustre_build_lock_params(int cmd, unsigned long open_flags, + __u64 connect_flags, + loff_t pos, ssize_t len, + struct lustre_rw_params *params) +{ + params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW; + params->lrp_brw_flags = 0; + + params->lrp_policy.l_extent.start = pos; + params->lrp_policy.l_extent.end = pos + len - 1; + /* + * for now O_APPEND always takes local locks. + */ + if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) { + params->lrp_policy.l_extent.start = 0; + params->lrp_policy.l_extent.end = OBD_OBJECT_EOF; + } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) { + /* + * liblustre: OST-side locking for all non-O_APPEND + * reads/writes. + */ + params->lrp_lock_mode = LCK_NL; + params->lrp_brw_flags = OBD_BRW_SRVLOCK; + } else { + /* + * nothing special for the kernel. In the future llite may use + * OST-side locks for small writes into highly contended + * files. + */ + } + params->lrp_ast_flags = (open_flags & O_NONBLOCK) ? + LDLM_FL_BLOCK_NOWAIT : 0; +} + +/* + * This is embedded into liblustre and llite super-blocks to keep track of + * connect flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports (LOVs) this + * mount is connected to. This field is updated by cl_ocd_update() + * under ->lco_lock. + */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +/* + * Chain of hash overflow pages. + */ +struct ll_dir_chain { + /* XXX something. Later */ +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + return ~0UL - hash; +} + +/** @} lite */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h new file mode 100644 index 000000000000..714ab378e431 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_log.h @@ -0,0 +1,576 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include + +#include +#include +#include +#include + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + struct llog_cookie phd_cookie; /* cookie of this log in its cat */ +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log; /* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct llog_logid *log_id) +{ + return log_id->lgl_oi.oi.oi_id; +} + +struct llog_handle; + +/* llog.c - general API */ +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid); +int llog_copy_handler(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data); +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata); +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index); +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param); +int llog_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_get_size(struct llog_handle *loghandle); + +/* llog_process flags */ +#define LLOG_FLAG_NODEAMON 0x0001 + +/* llog_cat.c - catalog api */ +struct llog_process_data { + /** + * Any useful data needed while processing catalog. This is + * passed later to process callback. + */ + void *lpd_data; + /** + * Catalog process callback function, called for each record + * in catalog. + */ + llog_cb_t lpd_cb; + /** + * Start processing the catalog from startcat/startidx + */ + int lpd_startcat; + int lpd_startidx; +}; + +struct llog_process_cat_data { + /** + * Temporary stored first_idx while scanning log. + */ + int lpcd_first_idx; + /** + * Temporary stored last_idx while scanning log. + */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data, int startcat, int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh); + +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie *cookies, int flags); + +int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *idx); + +int obd_llog_finish(struct obd_device *obd, int count); + +/* llog_ioctl.c */ +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data); + +/* llog_net.c */ +int llog_initiator_connect(struct llog_ctxt *ctxt); + +struct llog_operations { + int (*lop_destroy)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, + int *curr_idx, int next_idx, __u64 *offset, + void *buf, int len); + int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, + int prev_idx, void *buf, int len); + int (*lop_read_header)(const struct lu_env *env, + struct llog_handle *handle); + int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd); + int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, + int flags); + int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); + int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie *cookies, int flags); + int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, + struct llog_gen *gen, struct obd_uuid *uuid); + /** + * Any llog file must be opened first using llog_open(). Llog can be + * opened by name, logid or without both, in last case the new logid + * will be generated. + */ + int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_logid *logid, char *name, + enum llog_open_param); + /** + * Opened llog may not exist and this must be checked where needed using + * the llog_exist() call. + */ + int (*lop_exist)(struct llog_handle *lgh); + /** + * Close llog file and calls llog_free_handle() implicitly. + * Any opened llog must be closed by llog_close() call. + */ + int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); + /** + * Create new llog file. The llog must be opened. + * Must be used only for local llog operations. + */ + int (*lop_declare_create)(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th); + int (*lop_create)(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); + /** + * write new record in llog. It appends records usually but can edit + * existing records too. + */ + int (*lop_declare_write_rec)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, + int idx, struct thandle *th); + int (*lop_write_rec)(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *cookie, int cookiecount, + void *buf, int idx, struct thandle *th); + /** + * Add new record in llog catalog. Does the same as llog_write_rec() + * but using llog catalog. + */ + int (*lop_declare_add)(const struct lu_env *env, + struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); + int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *cookie, + void *buf, struct thandle *th); + /* Old llog_add version, used in MDS-LOV-OSC now and will gone with + * LOD/OSP replacement */ + int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies); +}; + +/* In-memory descriptor for a log object or log catalog */ +struct llog_handle { + struct rw_semaphore lgh_lock; + spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */ + struct llog_logid lgh_id; /* id of this log */ + struct llog_log_hdr *lgh_hdr; + struct file *lgh_file; + struct dt_object *lgh_obj; + int lgh_last_idx; + int lgh_cur_idx; /* used during llog_process */ + __u64 lgh_cur_offset; /* used during llog_process */ + struct llog_ctxt *lgh_ctxt; + union { + struct plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; +}; + +/* llog_lvfs.c */ +extern struct llog_operations llog_lvfs_ops; + +/* llog_osd.c */ +extern struct llog_operations llog_osd_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, + struct llog_catid *idarray); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, + struct llog_catid *idarray); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg, int group) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); + olg->olg_seq = group; +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_destroy(env, handle); + RETURN(rc); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h new file mode 100644 index 000000000000..fb1561a809b9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +# include +# include +# ifdef CONFIG_FS_POSIX_ACL +# include +# endif /* CONFIG_FS_POSIX_ACL */ +# include +#include +#include +#include +#include +#include +#include +#include + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +struct mdc_rpc_lock { + struct mutex rpcl_mutex; + struct lookup_intent *rpcl_it; + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + ENTRY; + + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP)) + return; + + /* This would normally block until the existing request finishes. + * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP)) + goto out; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); + out: + EXIT; +} + +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->valid & OBD_MD_FLMODEASIZE) { + if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) + exp->exp_obd->u.cli.cl_max_mds_easize = + body->max_mdsize; + if (exp->exp_obd->u.cli.cl_max_mds_cookiesize < + body->max_cookiesize) + exp->exp_obd->u.cli.cl_max_mds_cookiesize = + body->max_cookiesize; + } +} + + +struct mdc_cache_waiter { + struct list_head mcw_entry; + wait_queue_head_t mcw_waitq; +}; + +/* mdc/mdc_locks.c */ +int it_disposition(struct lookup_intent *it, int flag); +void it_clear_disposition(struct lookup_intent *it, int flag); +void it_set_disposition(struct lookup_intent *it, int flag); +int it_open_error(int phase, struct lookup_intent *it); + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h new file mode 100644 index 000000000000..b386f87471e3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mds.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include +#include +#include +#include +#include +#include + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(__u64 flags) +{ + return !(flags & MDS_OPEN_DELAY_CREATE || + !(flags & FMODE_WRITE)); +} + +/* these are local flags, used only on the client, private */ +#define M_CHECK_STALE 0200000000 + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h new file mode 100644 index 000000000000..dba26a6cfa38 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mdt.h @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_MDT_H +#define __LINUX_MDT_H + +/** \defgroup mdt mdt + * + * @{ + */ + +#include +#include +#include +#include +#include + +/* + * Common thread info for mdt, seq and fld + */ +struct com_thread_info { + /* + * for req-layout interface. + */ + struct req_capsule *cti_pill; +}; + +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + LASSERT(-rc < ESERIOUS); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int rc) +{ + return (rc < 0 && -rc & ESERIOUS); +} + +/** @} mdt */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h new file mode 100644 index 000000000000..874412ee58fc --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_net.h @@ -0,0 +1,3453 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. + * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ + +#include + +#include +// #include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * Max # of bulk operations in one request. + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */ +#define PTLRPC_BULK_OPS_BITS 2 +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. + */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) + +#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +# endif +# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE" +# endif +# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +# endif +# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +# endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. + * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. + * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) have a thousand of cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't suggest to choose server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details +* + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. + * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192. + * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT +#endif +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS 64 + +/** + * Assume file name length = FNAME_MAX = 256 (true for ext3). + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. + */ +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * This is the size of a maximum REINT_SETXATTR request: + * + * lustre_msg 56 (32 + 4 x 5 + 4) + * ptlrpc_body 184 + * mdt_rec_setxattr 136 + * lustre_capa 120 + * name 256 (XATTR_NAME_MAX) + * value 65536 (XATTR_SIZE_MAX) + */ +#define MDS_EA_MAXREQSIZE 66288 + +/** + * These are the maximum request and reply sizes (rounded up to 1 KB + * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL. + */ +#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \ + MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10) +#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE + +/** + * The update request includes all of updates from the create, which might + * include linkea (4K maxim), together with other updates, we set it to 9K: + * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K) + */ +#define MDS_OUT_MAXREQSIZE (9 * 1024) +#define MDS_OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 8 * 1024) + +/** + * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD. + * However, we need to allocate a much larger buffer for it because LNet + * requires each MD(rqbd) has at least MDS_REQ_MAXREQSIZE bytes left to avoid + * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a + * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request + * even there are about MDS_REG_MAX_REQSIZE bytes left in a rqbd, and memory + * utilization is very low. + * + * In the meanwhile, size of rqbd can't be too large, because rqbd can't be + * reused until all requests fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd be reused. + * Now we set request buffer size to 160 KB, so even each rqbd is unlinked + * from LNet with unused 65 KB, buffer utilization will be about 59%. + * Please check LU-2432 for details. + */ +#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 160 * 1024) + +/** + * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate. + */ +#define MDS_OUT_BUFSIZE max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 24 * 1024) + +/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ +#define FLD_MAXREQSIZE (160) + +/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ +#define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) + +/** + * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + + * __u32 padding */ +#define SEQ_MAXREQSIZE (160) + +/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ +#define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) + +/** MGS threads must be >= 3, see bug 22458 comment #28 */ +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + +#define MGS_NBUFS 64 +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (7 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. + * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 112 threads for each partition, so total threads number + * will be 112 * 4 = 448. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. + * + * So we can see that with these constants, threads number wil be at the + * similar level of old versions, unless the server has many cores. + */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 +#define OSS_NTHRS_MAX 512 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + +/** + * OST_IO_MAXREQSIZE ~= + * lustre_msg + ptlrpc_body + obdo + obd_ioobj + + * DT_MAX_BRW_PAGES * niobuf_remote + * + * - single object with 16 pages is 512 bytes + * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover + * - Must be a multiple of 1024 + * - actual size is about 18K + */ +#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (5 * 1024) +#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ + (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) + +#define OST_MAXREPSIZE (9 * 1024) +#define OST_IO_MAXREPSIZE OST_MAXREPSIZE + +#define OST_NBUFS 64 +/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */ +#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024) +/** + * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization + * rate of request buffer, please check comment of MDS_LOV_BUFSIZE for details. + */ +#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + +/* Macro to hide a typecast. */ +#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) + +/** + * Structure to single define portal connection. + */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + lnet_process_id_t c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. + * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; + /** List of requests in the set */ + struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. + */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaq argument passed to completion \a set_interpret callback. */ + void *set_arg; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now. */ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaq argument passed to the producer callback */ + void *set_producer_arg; +}; + +/** + * Description of a single ptrlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaq argument to pass to the callback */ + void *psc_data; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees. + */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? */ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + lnet_handle_md_t rs_md_h; + atomic_t rs_refcount; + + /** Context for the sevice thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrupted (if needed) to + * produce reply buffer for actual sending. In simple case + * of no network encryption we jus set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Number of locks awaiting client ACK */ + int rs_nlocks; + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + ldlm_mode_t rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREGISTERING = 0xebc0de05, + RQ_PHASE_UNDEFINED = 0xebc0de06 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). + */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a rquest from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + void (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future. + */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. + * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. + * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. + * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request being carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre spin_is_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unegisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc). + */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; +}; + +#define NRS_POL_NAME_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. + * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. + */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + module_t *nc_owner; + /** + * Policy registration flags; a bitmast of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. + * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. + * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during unregistration time, + * then unregistration and lprocfs operations will be properly + * serialized. + */ + module_t *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy, + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. + */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs:nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests ptlrpc_nrs:nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage Reference count taken on the policy instance + */ + long pol_ref; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +/* \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ + +/** + * \name CRR-N + * + * CRR-N, Client Round Robin over NIDs + * @{ + */ + +/** + * private data structure for CRR-N NRS + */ +struct nrs_crrn_net { + struct ptlrpc_nrs_resource cn_res; + cfs_binheap_t *cn_binheap; + cfs_hash_t *cn_cli_hash; + /** + * Used when a new scheduling round commences, in order to synchronize + * all clients with the new round number. + */ + __u64 cn_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 cn_sequence; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each client can have in a scheduling round. + */ + __u16 cn_quantum; +}; + +/** + * Object representing a client in CRR-N, as identified by its NID + */ +struct nrs_crrn_client { + struct ptlrpc_nrs_resource cc_res; + struct hlist_node cc_hnode; + lnet_nid_t cc_nid; + /** + * The round number against which this client is currently scheduling + * requests. + */ + __u64 cc_round; + /** + * The sequence number used for requests scheduled by this client during + * the current round number. + */ + __u64 cc_sequence; + atomic_t cc_ref; + /** + * Round Robin quantum; the maximum number of RPCs the client is allowed + * to schedule in a single batch of each round. + */ + __u16 cc_quantum; + /** + * # of pending requests for this client, on all existing rounds + */ + __u16 cc_active; +}; + +/** + * CRR-N NRS request definition + */ +struct nrs_crrn_req { + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = (1 << 0), + NOS_OST_WRITE = (1 << 1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. + */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. + * + * XXX: We waste some space for TRR policy instances by using a union, but it + * allows to consolidate some of the code between ORR and TRR, and these + * policies will probably eventually merge into one anyway. + */ +struct nrs_orr_key { + union { + /** object FID for ORR */ + struct lu_fid ok_fid; + /** OST index for TRR */ + __u32 ok_idx; + }; +}; + +/** + * The largest base string for unique hash/slab object names is + * "nrs_orr_reg_", so 13 characters. We add 3 to this to be used for the CPT + * id number, so this _should_ be more than enough for the maximum number of + * CPTs on any system. If it does happen that this statement is incorrect, + * nrs_orr_genobjname() will inevitably yield a non-unique name and cause + * kmem_cache_create() to complain (on Linux), so the erroneous situation + * will hopefully not go unnoticed. + */ +#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3) + +/** + * private data structure for ORR and TRR NRS + */ +struct nrs_orr_data { + struct ptlrpc_nrs_resource od_res; + cfs_binheap_t *od_binheap; + cfs_hash_t *od_obj_hash; + struct kmem_cache *od_cache; + /** + * Used when a new scheduling round commences, in order to synchronize + * all object or OST batches with the new round number. + */ + __u64 od_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 od_sequence; + /** + * RPC types that are currently supported. + */ + enum nrs_orr_supp od_supp; + /** + * Round Robin quantum; the maxium number of RPCs that each request + * batch for each object or OST can have in a scheduling round. + */ + __u16 od_quantum; + /** + * Whether to use physical disk offsets or logical file offsets. + */ + bool od_physical; + /** + * XXX: We need to provide a persistently allocated string to hold + * unique object names for this policy, since in currently supported + * versions of Linux by Lustre, kmem_cache_create() just sets a pointer + * to the name string provided. kstrdup() is used in the version of + * kmeme_cache_create() in current Linux mainline, so we may be able to + * remove this in the future. + */ + char od_objname[NRS_ORR_OBJ_NAME_MAX]; +}; + +/** + * Represents a backend-fs object or OST in the ORR and TRR policies + * respectively + */ +struct nrs_orr_object { + struct ptlrpc_nrs_resource oo_res; + struct hlist_node oo_hnode; + /** + * The round number against which requests are being scheduled for this + * object or OST + */ + __u64 oo_round; + /** + * The sequence number used for requests scheduled for this object or + * OST during the current round number. + */ + __u64 oo_sequence; + /** + * The key of the object or OST for which this structure instance is + * scheduling RPCs + */ + struct nrs_orr_key oo_key; + atomic_t oo_ref; + /** + * Round Robin quantum; the maximum number of RPCs that are allowed to + * be scheduled for the object or OST in a single batch of each round. + */ + __u16 oo_quantum; + /** + * # of pending requests for this object or OST, on all existing rounds + */ + __u16 oo_active; +}; + +/** + * ORR/TRR NRS request definition + */ +struct nrs_orr_req { + /** + * The offset range this request covers + */ + struct nrs_orr_req_range or_range; + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 or_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 or_sequence; + /** + * For debugging purposes. + */ + struct nrs_orr_key or_key; + /** + * An ORR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_orr_set:1; + /** + * A TRR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_trr_set:1; + /** + * Request offset ranges have been filled in with logical offset + * values. + */ + unsigned int or_logical_set:1; + /** + * Request offset ranges have been filled in with physical offset + * values. + */ + unsigned int or_physical_set:1; +}; + +/** @} ORR/TRR */ + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * use for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + cfs_binheap_node_t nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + /** + * CRR-N request defintion + */ + struct nrs_crrn_req crr; + /** ORR and TRR share the same request definition */ + struct nrs_orr_req orr; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. + * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. + * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /* Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and sent back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head rq_timed_list; + /** server-side history, used for debuging purposes. */ + struct list_head rq_history_list; + /** server-side per-export list */ + struct list_head rq_exp_list; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *rq_ops; + + /** initial thread servicing this request */ + struct ptlrpc_thread *rq_svc_thread; + + /** history sequence # */ + __u64 rq_history_seq; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request rq_nrq; + /** @} nrs */ + /** the index of service's srv_at_array into which request is linked */ + time_t rq_at_index; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + /** client-side flags are serialized by rq_lock */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. + */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, rq_must_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ + /* server-side flags */ + rq_packed_final:1, /* packed final reply */ + rq_hp:1, /* high priority RPC */ + rq_at_linked:1, /* link into service's srv_at_array */ + rq_reply_truncate:1, + rq_committed:1, + /* whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1, + /* bulk request, sent to server, but uncommitted */ + rq_unstable:1; + + unsigned int rq_nr_resend; + + enum rq_phase rq_phase; /* one of RQ_PHASE_* */ + enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ + atomic_t rq_refcount;/* client-side refcount for SENT race, + server-side refcounf for multiple replies */ + + /** Portal to which this request would be sent */ + short rq_request_portal; /* XXX FIXME bug 249 */ + /** Portal where to wait for reply and where reply would be sent */ + short rq_reply_portal; /* XXX FIXME bug 249 */ + + /** + * client-side: + * !rq_truncate : # reply bytes actually received, + * rq_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** + * List item to for replay list. Not yet commited requests get linked + * there. + * Also see \a rq_replay comment above. + */ + struct list_head rq_replay_list; + + /** + * security and encryption data + * @{ */ + struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */ + struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */ + struct list_head rq_ctx_chain; /**< link to waited ctx */ + + struct sptlrpc_flavor rq_flvr; /**< for client & server */ + enum lustre_sec_part rq_sp_from; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_remote:1, /* authed as remote user */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1; /* pill initialized */ + + uid_t rq_auth_uid; /* authed uid */ + uid_t rq_auth_mapped_uid; /* authed uid mapped to */ + + /* (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *rq_user_desc; + + /* various buffer pointers */ + struct lustre_msg *rq_reqbuf; /* req wrapper */ + char *rq_repbuf; /* rep buffer */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ + struct lustre_msg *rq_clrbuf; /* only in priv mode */ + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** What was import generation when this request was sent */ + int rq_import_generation; + enum lustre_imp_state rq_send_state; + + /** how many early replies (for stats) */ + int rq_early_count; + + /** client+server request */ + lnet_handle_md_t rq_req_md_h; + struct ptlrpc_cb_id rq_req_cbid; + /** optional time limit for send attempts */ + cfs_duration_t rq_delay_limit; + /** time request was first queued */ + cfs_time_t rq_queued_time; + + /* server-side... */ + /** request arrival time */ + struct timeval rq_arrival_time; + /** separated reply state */ + struct ptlrpc_reply_state *rq_reply_state; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *rq_rqbd; + + /** client-only incoming reply */ + lnet_handle_md_t rq_reply_md_h; + wait_queue_head_t rq_reply_waitq; + struct ptlrpc_cb_id rq_reply_cbid; + + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + lnet_process_id_t rq_peer; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** Client side, import where request is being sent */ + struct obd_import *rq_import; + + /** Replay callback, called after request is replayed at recovery */ + void (*rq_replay_cb)(struct ptlrpc_request *); + /** + * Commit callback, called when request is committed and about to be + * freed. + */ + void (*rq_commit_cb)(struct ptlrpc_request *); + /** Opaq data for replay and commit callbacks. */ + void *rq_cb_data; + + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *rq_bulk; + + /** client outgoing req */ + /** + * when request/reply sent (secs), or time when request should be sent + */ + time_t rq_sent; + /** time for request really sent out */ + time_t rq_real_sent; + + /** when request must finish. volatile + * so that servers' early reply updates to the deadline aren't + * kept in per-cpu cache */ + volatile time_t rq_deadline; + /** when req reply unlink must finish. */ + time_t rq_reply_deadline; + /** when req bulk unlink must finish. */ + time_t rq_bulk_deadline; + /** + * service time estimate (secs) + * If the requestsis not served by this time, it is marked as timed out. + */ + int rq_timeout; + + /** Multi-rpc bits */ + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t rq_set_waitq; + /** Link item for request set lists */ + struct list_head rq_set_chain; + /** Link back to the request set */ + struct ptlrpc_request_set *rq_set; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t rq_interpret_reply; + /** Async completion context */ + union ptlrpc_async_args rq_async_args; + + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + + struct lu_context rq_session; + struct lu_context rq_recov_session; + + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return it's status or original + * rc if there was no handler defined for this request. + */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. + */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns 1 if request buffer at offset \a index was already swabbed + */ +static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1 << index); +} + +/** + * Returns 1 if request reply buffer at offset \a index was already swabbed + */ +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1 << index); +} + +/** + * Returns 1 if request needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns 1 if request reply needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); + req->rq_req_swab_mask |= 1 << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); + req->rq_rep_swab_mask |= 1 << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREGISTERING: + return "Unregistering"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text stringi + * description + */ +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +/** Convert bit flags into a string */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_err, "E"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), \ + FLAG(req->rq_waiting, "W"), \ + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ + FLAG(req->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Helper that decides if we need to print request accordig to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) \ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while(0) + +/** + * This is the debug print function you need to use to print request sturucture + * content into lustre debug log. + * for most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +#define BULK_GET_SOURCE 0 +#define BULK_PUT_SINK 1 +#define BULK_GET_SINK 2 +#define BULK_PUT_SOURCE 3 + +/** + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed bt a transfer (o receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulks transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. + */ +struct ptlrpc_bulk_desc { + /** completed with failure */ + unsigned long bd_failure:1; + /** {put,get}{source,sink} */ + unsigned long bd_type:2; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_xid; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* + * encrypt iov, size is either 0 or bd_iov_count. + */ + lnet_kiov_t *bd_enc_iov; + + lnet_kiov_t bd_iov[0]; +}; + +enum { + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + */ + struct lc_watchdog *t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline int thread_is_event(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_EVENT); +} + +static inline int thread_is_signal(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_SIGNAL); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + +/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * notifies wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. + */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + lnet_handle_md_t rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->srv_init(). + */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patche dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and perform actions for a specific server like IO service for OST + * or general metadata service for MDS. + */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** service thread list */ + struct list_head srv_threads; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of /proc dir tree for this service */ + proc_dir_entry_t *srv_procroot; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. + */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cptab_bits >= cfs_cpt_numbert(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, but we + * will have multiple instances very soon (instance per CPT). + * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue. + */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + cfs_duration_t scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. + */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + timer_list_t scp_at_timer; + /** debug */ + cfs_time_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + ((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in cfs_daemonize() + */ + char pc_name[16]; + /** + * Environment for request interpreters to run in. + */ + struct lu_env pc_env; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = 1 << 3, + /** + * The ptlrpcd is bound to some CPU core. + */ + LIOD_BIND = 1 << 4, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern lnet_handle_eq_t ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(lnet_event_t *ev); +extern void reply_in_callback(lnet_event_t *ev); +extern void client_bulk_callback(lnet_event_t *ev); +extern void request_in_callback(lnet_event_t *ev); +extern void reply_out_callback(lnet_event_t *ev); +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* ptlrpc/niobuf.c */ +/** + * Actual interfacing with LNet to put/get/register/unregister stuff + * @{ + */ + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + req->rq_bulk_deadline > cfs_time_current_sec()) + return 1; + + if (!desc) + return 0; + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. + * @{ + */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_expired_set(void *data); +void ptlrpc_interrupted_set(void *data); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + void (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, + int opcode, int count, __u32 *lengths, + char **bufs); +struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); +static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 1); +} +static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 0); +} +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int); +static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to post */ + unsigned int bc_buf_size; + /* portal to listed for requests on */ + unsigned int bc_req_portal; + /* portal of where to send replies to */ + unsigned int bc_rep_portal; + /* maximum request size to be accepted for this service */ + unsigned int bc_req_max_size; + /* maximum reply size this service can ever send */ + unsigned int bc_rep_max_size; +}; + +struct ptlrpc_service_thr_conf { + /* threadname should be 8 characters or less - 6 will be added on */ + char *tc_thr_name; + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ + unsigned int tc_nthrs_max; + /* user specified threads number, it will be validated due to + * other members of this structure. */ + unsigned int tc_nthrs_user; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; + /* Tags for lu_context associated with service thread */ + __u32 tc_ctx_tags; +}; + +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + +struct ptlrpc_service_conf { + /* service name */ + char *psc_name; + /* soft watchdog timeout multiplifier to print stuck service traces */ + unsigned int psc_watchdog_factor; + /* buffer information */ + struct ptlrpc_service_buf_conf psc_buf; + /* thread information */ + struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; + /* function table */ + struct ptlrpc_service_ops psc_ops; +}; + +/* ptlrpc/service.c */ +/** + * Server-side services API. Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/* ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); + +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +int lustre_msg_hdr_size(__u32 magic, int count); +int lustre_msg_size(__u32 magic, int count, __u32 *lengths); +int lustre_msg_size_v2(int count, __u32 *lengths); +int lustre_packed_msg_size(struct lustre_msg *msg); +int lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); +void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); +int lustre_msg_buflen(struct lustre_msg *m, int n); +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len); +int lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, int n, int max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_flags(struct lustre_msg *msg, int flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, int version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18); +#else +# warning "remove checksum compatibility support for b1_8" +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +#endif +void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREGISTERING) { + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply || req->rq_must_unlink; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + cfs_time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + cfs_time_current())) { + return 1; + } + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +cfs_time_t ptlrpc_suspend_wakeup_time(void); +void ping_evictor_start(void); +void ping_evictor_stop(void); +int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req); +void ptlrpc_pinger_ir_up(void); +void ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc daemon bind policy */ +typedef enum { + /* all ptlrpcd threads are free mode */ + PDB_POLICY_NONE = 1, + /* all ptlrpcd threads are bound mode */ + PDB_POLICY_FULL = 2, + /* ... */ + PDB_POLICY_PAIR = 3, + /* ... , + * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1]. + * If kernel supports NUMA, pthrpcd threads are binded and + * grouped by NUMA node */ + PDB_POLICY_NEIGHBOR = 4, +} pdb_policy_t; + +/* ptlrpc daemon load policy + * It is caller's duty to specify how to push the async RPC into some ptlrpcd + * queue, but it is not enforced, affected by "ptlrpcd_bind_policy". If it is + * "PDB_POLICY_FULL", then the RPC will be processed by the selected ptlrpcd, + * Otherwise, the RPC may be processed by the selected ptlrpcd or its partner, + * depends on which is scheduled firstly, to accelerate the RPC processing. */ +typedef enum { + /* on the same CPU core as the caller */ + PDL_POLICY_SAME = 1, + /* within the same CPU partition, but not the same core as the caller */ + PDL_POLICY_LOCAL = 2, + /* round-robin on all CPU cores, but not the same core as the caller */ + PDL_POLICY_ROUND = 3, + /* the specified CPU core is preferred, but not enforced */ + PDL_POLICY_PREFERRED = 4, +} pdl_policy_t; + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +#ifdef LPROCFS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_destroy(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); +int llog_origin_handle_close(struct ptlrpc_request *req); +int llog_origin_handle_cancel(struct ptlrpc_request *req); + +/* ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; + +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h b/drivers/staging/lustre/lustre/include/lustre_param.h new file mode 100644 index 000000000000..ed654684cb64 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_param.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_param.h + * + * User-settable parameter keys + * + * Author: Nathan Rutman + */ + +#ifndef _LUSTRE_PARAM_H +#define _LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +/* obd_config.c */ +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, __u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, __u32 net); +/* obd_mount.c */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4); + + + +/****************** User-settable parameter keys *********************/ +/* e.g. + tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + ... testfs-MDT0000.lov.stripesize=4M + ... testfs-OST0000.ost.client_cache_seconds=15 + ... testfs.sys.timeout= + ... testfs.llite.max_read_ahead_mb=16 +*/ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h new file mode 100644 index 000000000000..1c3041f50049 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_quota.h @@ -0,0 +1,239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include + +#include +#include +#include + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record type in an union that can be used to read any records + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. */ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. */ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration prodecure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota, it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. */ + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + proc_dir_entry_t *); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, int *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); +/* This is exported for the ldiskfs quota migration only, + * see convert_quota_file() */ +int lquota_disk_write_glb(const struct lu_env *, struct dt_object *, + __u64, struct lquota_glb_rec *); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. */ +#define QUOTA_MAX_TRANSIDS 4 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h new file mode 100644 index 000000000000..f4d3820865f1 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h @@ -0,0 +1,334 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. */ +#define REQ_MAX_FIELD_NR 9 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +#if !defined(__REQ_LAYOUT_USER__) + +/* struct ptlrpc_request, lustre_msg* */ +#include + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size); +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen); +int req_layout_init(void); +void req_layout_fini(void); + +/* __REQ_LAYOUT_USER__ */ +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_GETSTATUS; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_UPDATE_OBJ; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. + */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_PIN; +extern struct req_format RQF_MDS_UNPIN; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_WRITEPAGE; +extern struct req_format RQF_MDS_IS_SUBDIR; +extern struct req_format RQF_MDS_DONE_WRITING; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACHECK; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QC_CALLBACK; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACHECK; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO_GENERIC; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +/* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; +extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; + +/* + * connection handle received in MDS_CONNECT request. + */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; +extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_UPDATE; +extern struct req_msg_field RMF_UPDATE_REPLY; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h new file mode 100644 index 000000000000..9e0908e1c4d6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_sec.h @@ -0,0 +1,1145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +/* Linux specific */ +struct key; +struct seq_file; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. + *
+ * ------------------------------------------------------------------------
+ * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
+ * ------------------------------------------------------------------------
+ * 
+ * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. + */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset, + int initial); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. + */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). + */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is descripted by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is descripted by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ev: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is descripted by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + cfs_time_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. + */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). + */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts has been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead is always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called then the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintainance mechamism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). + */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given an context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg point to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). + */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf point to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which be transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). + */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. + * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + module_t *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + cfs_time_t ps_gc_interval; /* in seconds */ + cfs_time_t ps_gc_next; /* in seconds */ +}; + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * lprocfs + */ +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline +struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline +void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which only used by policy impelentation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +enum { + LUSTRE_SEC_NONE = 0, + LUSTRE_SEC_REMOTE = 1, + LUSTRE_SEC_SPECIFY = 2, + LUSTRE_SEC_ALL = 3 +}; + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h new file mode 100644 index 000000000000..84defce0f623 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_update.h @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H + +#define UPDATE_BUFFER_SIZE 8192 +struct update_request { + struct dt_device *ur_dt; + struct list_head ur_list; /* attached itself to thandle */ + int ur_flags; + int ur_rc; /* request result */ + int ur_batchid; /* Current batch(trans) id */ + struct update_buf *ur_buf; /* Holding the update req */ +}; + +static inline unsigned long update_size(struct update *update) +{ + unsigned long size; + int i; + + size = cfs_size_round(offsetof(struct update, u_bufs[0])); + for (i = 0; i < UPDATE_BUF_COUNT; i++) + size += cfs_size_round(update->u_lens[i]); + + return size; +} + +static inline void *update_param_buf(struct update *update, int index, + int *size) +{ + int i; + void *ptr; + + if (index >= UPDATE_BUF_COUNT) + return NULL; + + ptr = (char *)update + cfs_size_round(offsetof(struct update, + u_bufs[0])); + for (i = 0; i < index; i++) { + LASSERT(update->u_lens[i] > 0); + ptr += cfs_size_round(update->u_lens[i]); + } + + if (size != NULL) + *size = update->u_lens[index]; + + return ptr; +} + +static inline unsigned long update_buf_size(struct update_buf *buf) +{ + unsigned long size; + int i = 0; + + size = cfs_size_round(offsetof(struct update_buf, ub_bufs[0])); + for (i = 0; i < buf->ub_count; i++) { + struct update *update; + + update = (struct update *)((char *)buf + size); + size += update_size(update); + } + LASSERT(size <= UPDATE_BUFFER_SIZE); + return size; +} + +static inline void *update_buf_get(struct update_buf *buf, int index, int *size) +{ + int count = buf->ub_count; + void *ptr; + int i = 0; + + if (index >= count) + return NULL; + + ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf, + ub_bufs[0])); + for (i = 0; i < index; i++) + ptr += update_size((struct update *)ptr); + + if (size != NULL) + *size = update_size((struct update *)ptr); + + return ptr; +} + +static inline void update_init_reply_buf(struct update_reply *reply, int count) +{ + reply->ur_version = UPDATE_REPLY_V1; + reply->ur_count = count; +} + +static inline void *update_get_buf_internal(struct update_reply *reply, + int index, int *size) +{ + char *ptr; + int count = reply->ur_count; + int i; + + if (index >= count) + return NULL; + + ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply, + ur_lens[count])); + for (i = 0; i < index; i++) { + LASSERT(reply->ur_lens[i] > 0); + ptr += cfs_size_round(reply->ur_lens[i]); + } + + if (size != NULL) + *size = reply->ur_lens[index]; + + return ptr; +} + +static inline void update_insert_reply(struct update_reply *reply, void *data, + int data_len, int index, int rc) +{ + char *ptr; + + ptr = update_get_buf_internal(reply, index, NULL); + LASSERT(ptr != NULL); + + *(int *)ptr = cpu_to_le32(rc); + ptr += sizeof(int); + if (data_len > 0) { + LASSERT(data != NULL); + memcpy(ptr, data, data_len); + } + reply->ur_lens[index] = data_len + sizeof(int); +} + +static inline int update_get_reply_buf(struct update_reply *reply, void **buf, + int index) +{ + char *ptr; + int size = 0; + int result; + + ptr = update_get_buf_internal(reply, index, &size); + result = *(int *)ptr; + + if (result < 0) + return result; + + LASSERT((ptr != NULL && size >= sizeof(int))); + *buf = ptr + sizeof(int); + return size - sizeof(int); +} + +static inline int update_get_reply_result(struct update_reply *reply, + void **buf, int index) +{ + void *ptr; + int size; + + ptr = update_get_buf_internal(reply, index, &size); + LASSERT(ptr != NULL && size > sizeof(int)); + return *(int *)ptr; +} + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h new file mode 100644 index 000000000000..dc187b8f741f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_ver.h @@ -0,0 +1,24 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ +/* This file automatically generated from lustre/include/lustre_ver.h.in, + * based on parameters in lustre/autoconf/lustre-version.ac. + * Changes made directly to this file will be lost. */ + +#define LUSTRE_MAJOR 2 +#define LUSTRE_MINOR 3 +#define LUSTRE_PATCH 64 +#define LUSTRE_FIX 0 +#define LUSTRE_VERSION_STRING "2.3.64" + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX) + +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). */ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0) + +#endif diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h new file mode 100644 index 000000000000..28f1a6b76f73 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lvfs.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#define LL_FID_NAMELEN (16 + 1 + 8 + 1) + +#include +#include + +#include + + +/* lvfs_common.c */ +struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64 ,void *data); + +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *cred); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *cred); +#endif diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h new file mode 100644 index 000000000000..eefa0f11bd13 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/md_object.h @@ -0,0 +1,946 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/md_object.h + * + * Extention of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdt, cmm, and mdt are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. + */ +#include + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2 +}; + +enum { + MD_CAPAINFO_MAX = 5 +}; + +/** there are at most 5 fids in one operation, see rename, NOTE the last one + * is a temporary one used for is_subdir() */ +struct md_capainfo { + __u32 mc_auth; + __u32 mc_padding; + struct lu_fid mc_fid[MD_CAPAINFO_MAX]; + struct lustre_capa *mc_capa[MD_CAPAINFO_MAX]; +}; + +struct md_quota { + struct obd_export *mq_exp; +}; + +/** + * Implemented in mdd/mdd_handler.c. + * + * XXX should be moved into separate .h/.c together with all md security + * related definitions. + */ +struct md_capainfo *md_capainfo(const struct lu_env *env); +struct md_quota *md_quota(const struct lu_env *env); + +/** metadata attributes */ +enum ma_valid { + MA_INODE = (1 << 0), + MA_LOV = (1 << 1), + MA_COOKIE = (1 << 2), + MA_FLAGS = (1 << 3), + MA_LMV = (1 << 4), + MA_ACL_DEF = (1 << 5), + MA_LOV_DEF = (1 << 6), + MA_LAY_GEN = (1 << 7), + MA_HSM = (1 << 8), + MA_SOM = (1 << 9), + MA_PFID = (1 << 10) +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = (1 << 0), + MDT_PDO_LOCK = (1 << 1) +} mdl_type_t; + +/* memory structure for hsm attributes + * for fields description see the on disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + +#define IOEPOCH_INVAL 0 + +/* memory structure for som attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som_data { + __u32 msd_compat; + __u32 msd_incompat; + __u64 msd_ioepoch; + __u64 msd_size; + __u64 msd_blocks; + __u64 msd_mountid; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct lov_mds_md *ma_lmm; + struct lmv_stripe_md *ma_lmv; + void *ma_acl; + struct llog_cookie *ma_cookie; + struct lustre_capa *ma_capa; + struct md_som_data *ma_som; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; + int ma_cookie_size; + __u16 ma_layout_gen; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + const char *sp_symname; + /** parent FID for cross-ref mkdir */ + const struct lu_fid *sp_pfid; + /** eadata for regular files */ + struct md_spec_reg { + /** lov objs exist already */ + const struct lu_fid *fid; + const void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ + __u64 sp_cr_flags; + + /** don't create lov objects or llog cookie - this replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1; /* only remove name entry */ + + /** Current lock mode for parent dir where create is performing. */ + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +/** + * Operations implemented for each md object (both directory and leaf). + */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + int (*moo_changelog)(const struct lu_env *env, + enum changelog_rec_type type, int flags, + struct md_object *obj); + /** part of cross-ref operation */ + int (*moo_object_create)(const struct lu_env *env, + struct md_object *obj, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*moo_ref_add)(const struct lu_env *env, + struct md_object *obj, + const struct md_attr *ma); + + int (*moo_ref_del)(const struct lu_env *env, + struct md_object *obj, + struct md_attr *ma); + + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, int flag); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, int mode); + + int (*moo_capa_get)(const struct lu_env *, struct md_object *, + struct lustre_capa *, int renewal); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj, + struct lov_mds_md *lmm, struct ldlm_extent *extent, + struct lustre_handle *lockh); + int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj, + struct lov_mds_md *lmm, + struct lustre_handle *lockh); + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Operations implemented for each directory object. + */ +struct md_dir_operations { + int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid, struct lu_fid *sfid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + /** This method is used to compare a requested layout to an existing + * layout (struct lov_mds_md_v1/3 vs struct lov_mds_md_v1/3) */ + int (*mdo_lum_lmm_cmp)(const struct lu_env *env, + struct md_object *cobj, + const struct md_op_spec *spec, + struct md_attr *ma); + + /** partial ops for cross-ref case */ + int (*mdo_name_insert)(const struct lu_env *env, + struct md_object *obj, + const struct lu_name *lname, + const struct lu_fid *fid, + const struct md_attr *ma); + + int (*mdo_name_remove)(const struct lu_env *env, + struct md_object *obj, + const struct lu_name *lname, + const struct md_attr *ma); + + int (*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj, + struct md_object *tobj, const struct lu_fid *fid, + const struct lu_name *lname, struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. */ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m, + int *md_size, int *cookie_size); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m, + int mode, unsigned long timeout, __u32 alg, + struct lustre_capa_key *keys); + + int (*mdo_update_capa_key)(const struct lu_env *env, + struct md_device *m, + struct lustre_capa_key *key); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +enum md_upcall_event { + /** Sync the md layer*/ + MD_LOV_SYNC = (1 << 0), + /** Just for split, no need trans, for replay */ + MD_NO_TRANS = (1 << 1), + MD_LOV_CONFIG = (1 << 2), + /** Trigger quota recovery */ + MD_LOV_QUOTA = (1 << 3) +}; + +struct md_upcall { + /** this lock protects upcall using against its removal + * read lock is for usage the upcall, write - for init/fini */ + struct rw_semaphore mu_upcall_sem; + /** device to call, upper layer normally */ + struct md_device *mu_upcall_dev; + /** upcall function */ + int (*mu_upcall)(const struct lu_env *env, struct md_device *md, + enum md_upcall_event ev, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; + struct md_upcall md_upcall; +}; + +static inline void md_upcall_init(struct md_device *m, void *upcl) +{ + init_rwsem(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = NULL; + m->md_upcall.mu_upcall = upcl; +} + +static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up) +{ + down_write(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = up; + up_write(&m->md_upcall.mu_upcall_sem); +} + +static inline void md_upcall_fini(struct md_device *m) +{ + down_write(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = NULL; + m->md_upcall.mu_upcall = NULL; + up_write(&m->md_upcall.mu_upcall_sem); +} + +static inline int md_do_upcall(const struct lu_env *env, struct md_device *m, + enum md_upcall_event ev, void *data) +{ + int rc = 0; + down_read(&m->md_upcall.mu_upcall_sem); + if (m->md_upcall.mu_upcall_dev != NULL && + m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) { + rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env, + m->md_upcall.mu_upcall_dev, + ev, data); + } + up_read(&m->md_upcall.mu_upcall_sem); + return rc; +} + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +/** + * seq-server site. + */ +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site. + */ + mdsno_t ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of0(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of0(o, struct md_object, mo_lu); +} + +static inline struct md_object *md_object_next(const struct md_object *obj) +{ + return (obj ? lu2md(lu_object_next(&obj->mo_lu)) : NULL); +} + +static inline struct md_device *md_obj2dev(const struct md_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->mo_lu.lo_dev)); + return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev); +} + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + struct md_attr *at, + int mask) +{ + LASSERT(c->mo_ops->moo_permission); + return c->mo_ops->moo_permission(env, p, c, at, mask); +} + +static inline int mo_attr_get(const struct lu_env *env, + struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + int flags, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_changelog); + return m->mo_ops->moo_changelog(env, type, flags, m); +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, + struct md_object *m, + int flags) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, flags); +} + +static inline int mo_close(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma, + int mode) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, mode); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_create(const struct lu_env *env, + struct md_object *m, + const struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_object_create); + return m->mo_ops->moo_object_create(env, m, spc, at); +} + +static inline int mo_ref_add(const struct lu_env *env, + struct md_object *m, + const struct md_attr *ma) +{ + LASSERT(m->mo_ops->moo_ref_add); + return m->mo_ops->moo_ref_add(env, m, ma); +} + +static inline int mo_ref_del(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma) +{ + LASSERT(m->mo_ops->moo_ref_del); + return m->mo_ops->moo_ref_del(env, m, ma); +} + +static inline int mo_capa_get(const struct lu_env *env, + struct md_object *m, + struct lustre_capa *c, + int renewal) +{ + LASSERT(m->mo_ops->moo_capa_get); + return m->mo_ops->moo_capa_get(env, m, c, renewal); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_file_lock(const struct lu_env *env, struct md_object *m, + struct lov_mds_md *lmm, + struct ldlm_extent *extent, + struct lustre_handle *lockh) +{ + LASSERT(m->mo_ops->moo_file_lock); + return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh); +} + +static inline int mo_file_unlock(const struct lu_env *env, struct md_object *m, + struct lov_mds_md *lmm, + struct lustre_handle *lockh) +{ + LASSERT(m->mo_ops->moo_file_unlock); + return m->mo_ops->moo_file_unlock(env, m, lmm, lockh); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + struct md_object *mo, + const struct lu_fid *fid, + struct lu_fid *sfid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +static inline int mdo_lum_lmm_cmp(const struct lu_env *env, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp); + return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma); +} + +static inline int mdo_name_insert(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + const struct lu_fid *f, + const struct md_attr *ma) +{ + LASSERT(p->mo_dir_ops->mdo_name_insert); + return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma); +} + +static inline int mdo_name_remove(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + const struct md_attr *ma) +{ + LASSERT(p->mo_dir_ops->mdo_name_remove); + return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma); +} + +static inline int mdo_rename_tgt(const struct lu_env *env, + struct md_object *p, + struct md_object *t, + const struct lu_fid *lf, + const struct lu_name *lname, + struct md_attr *ma) +{ + if (t) { + LASSERT(t->mo_dir_ops->mdo_rename_tgt); + return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma); + } else { + LASSERT(p->mo_dir_ops->mdo_rename_tgt); + return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma); + } +} + +/** + * Used in MDD/OUT layer for object lock rule + **/ +enum mdd_object_role { + MOR_SRC_PARENT, + MOR_SRC_CHILD, + MOR_TGT_PARENT, + MOR_TGT_CHILD, + MOR_TGT_ORPHAN +}; + +struct dt_device; +/** + * Structure to hold object information. This is used to create object + * \pre llod_dir exist + */ +struct lu_local_obj_desc { + const char *llod_dir; + const char *llod_name; + __u32 llod_oid; + int llod_is_index; + const struct dt_index_features *llod_feat; + struct list_head llod_linkage; +}; + +struct md_object *llo_store_resolve(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *path, + struct lu_fid *fid); + +struct md_object *llo_store_open(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + struct lu_fid *fid); + +struct md_object *llo_store_create_index(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid, + const struct dt_index_features *feat); + +struct md_object *llo_store_create(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid); + +void llo_local_obj_register(struct lu_local_obj_desc *); +void llo_local_obj_unregister(struct lu_local_obj_desc *); + +int llo_local_objects_setup(const struct lu_env *env, + struct md_device * md, + struct dt_device * dt); + +int llo_global_init(void); +void llo_global_fini(void); + +int lustre_buf2som(void *buf, int rc, struct md_som_data *msd); +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, struct md_hsm *mh); + +struct lu_ucred { + __u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + cfs_cap_t uc_cap; + __u32 uc_umask; + group_info_t *uc_ginfo; + struct md_identity *uc_identity; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +#define md_cap_t(x) (x) + +#define MD_CAP_TO_MASK(x) (1 << (x)) + +#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag)) + +/* capable() is copied from linux kernel! */ +static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap) +{ + if (md_cap_raised(uc->uc_cap, cap)) + return 1; + return 0; +} + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h new file mode 100644 index 000000000000..dade2fd2eb7d --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd.h @@ -0,0 +1,1683 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_H +#define __OBD_H + +#include + +#define IOC_OSC_TYPE 'h' +#define IOC_OSC_MIN_NR 20 +#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) +#define IOC_OSC_MAX_NR 50 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_MIN_NR 20 +#define IOC_MDC_MAX_NR 50 + +#include +#include +#include +#include +#include +#include + +#include + + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +static inline void loi_init(struct lov_oinfo *loi) +{ +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + __u64 lsm_maxbytes; + struct { + /* Public members. */ + struct ost_id lw_object_oi; /* lov object id/seq */ + + /* LOV-private members start here -- only for use in lov/. */ + __u32 lw_magic; + __u32 lw_stripe_size; /* size of the stripe */ + __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ + __u16 lw_stripe_count; /* number of objects being striped over */ + __u16 lw_layout_gen; /* generation of the layout */ + char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + } lsm_wire; + + struct lov_oinfo *lsm_oinfo[0]; +}; + +#define lsm_oi lsm_wire.lw_object_oi +#define lsm_magic lsm_wire.lw_magic +#define lsm_layout_gen lsm_wire.lw_layout_gen +#define lsm_stripe_size lsm_wire.lw_stripe_size +#define lsm_pattern lsm_wire.lw_pattern +#define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name + +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* Lock policy. It keeps an extent which is specific for a particular + * OSC. (e.g. lov_prep_enqueue_set initialises extent of the policy, + * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue. */ + ldlm_policy_data_t oi_policy; + /* Flags used for set request specific flags: + - while lock handling, the flags obtained on the enqueue + request are set here. + - while stats, the flags used for control delay/resend. + - while setattr, the flags used for distinguish punch operation + */ + __u64 oi_flags; + /* Lock handle specific for every OSC lock. */ + struct lustre_handle *oi_lockh; + /* lsm data specific for every OSC. */ + struct lov_stripe_md *oi_md; + /* obdo data specific for every OSC, if needed at all. */ + struct obdo *oi_oa; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on upper + * level. E.g. it is used for update lsm->lsm_oinfo at every recieved + * request in osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. */ + obd_enqueue_update_f oi_cb_up; + /* oss capability, its type is obd_capa in client to avoid copy. + * in contrary its type is lustre_capa in OSS. */ + void *oi_capa; + /* transfer jobid from ost_sync() to filter_sync()... */ + char *oi_jobid; +}; + +/* compare all relevant fields. */ +static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, + struct lov_stripe_md *m2) +{ + /* + * ->lsm_wire contains padding, but it should be zeroed out during + * allocation. + */ + return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire); +} + +static inline int lov_lum_lsm_cmp(struct lov_user_md *lum, + struct lov_stripe_md *lsm) +{ + if (lsm->lsm_magic != lum->lmm_magic) + return 1; + if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) && + (lsm->lsm_stripe_count != lum->lmm_stripe_count)) + return 2; + if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) && + (lsm->lsm_stripe_size != lum->lmm_stripe_size)) + return 3; + if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) && + (lsm->lsm_pattern != lum->lmm_pattern)) + return 4; + if ((lsm->lsm_magic == LOV_MAGIC_V3) && + (strncmp(lsm->lsm_pool_name, + ((struct lov_user_md_v3 *)lum)->lmm_pool_name, + LOV_MAXPOOLNAME) != 0)) + return 5; + return 0; +} + +static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3, + int *lmm_magic, + struct lov_user_md *lum) +{ + if (lum && copy_from_user(lumv3, lum,sizeof(struct lov_user_md_v1))) + return -EFAULT; + + *lmm_magic = lumv3->lmm_magic; + + if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3); + *lmm_magic = LOV_USER_MAGIC_V1; + } else if (*lmm_magic == LOV_USER_MAGIC_V3) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + lustre_swab_lov_user_md_v3(lumv3); + *lmm_magic = LOV_USER_MAGIC_V3; + } else if (*lmm_magic != LOV_USER_MAGIC_V1) { + CDEBUG(D_IOCTL, + "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3); + return -EINVAL; + } + return 0; +} + +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + proc_dir_entry_t *typ_procroot; + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; +}; + +struct brw_page { + obd_off off; + struct page *pg; + int count; + obd_flag flag; +}; + +/* Individual type definitions */ + +struct ost_server_data; + +struct osd_properties { + size_t osd_max_ea_size; +}; + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct super_block *obt_sb; + /** last_rcvd file */ + struct file *obt_rcvd_filp; + __u64 obt_mount_count; + struct rw_semaphore obt_rwsem; + struct vfsmount *obt_vfsmnt; + struct file *obt_health_check_filp; + struct osd_properties obt_osd_properties; + struct obd_job_stats obt_jobstats; +}; + +/* llog contexts */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT, + LLOG_MDS_OST_ORIG_CTXT, + LLOG_MDS_OST_REPL_CTXT, + LLOG_SIZE_ORIG_CTXT, + LLOG_SIZE_REPL_CTXT, + LLOG_RD1_ORIG_CTXT, + LLOG_RD1_REPL_CTXT, + LLOG_TEST_ORIG_CTXT, + LLOG_TEST_REPL_CTXT, + LLOG_LOVEA_ORIG_CTXT, + LLOG_LOVEA_REPL_CTXT, + LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ + LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ + LLOG_MAX_CTXTS +}; + +#define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_subdirs { + struct dentry *dentry[FILTER_SUBDIR_COUNT]; +}; + + +struct filter_ext { + __u64 fe_start; + __u64 fe_end; +}; + +struct filter_obd { + /* NB this field MUST be first */ + struct obd_device_target fo_obt; + const char *fo_fstype; + + int fo_group_count; + struct dentry *fo_dentry_O; + struct dentry **fo_dentry_O_groups; + struct filter_subdirs *fo_dentry_O_sub; + struct mutex fo_init_lock; /* group initialization lock*/ + int fo_committed_group; + + spinlock_t fo_objidlock; /* protect fo_lastobjid */ + + unsigned long fo_destroys_in_progress; + struct mutex fo_create_locks[FILTER_SUBDIR_COUNT]; + + struct list_head fo_export_list; + int fo_subdir_count; + + obd_size fo_tot_dirty; /* protected by obd_osfs_lock */ + obd_size fo_tot_granted; /* all values in bytes */ + obd_size fo_tot_pending; + int fo_tot_granted_clients; + + obd_size fo_readcache_max_filesize; + spinlock_t fo_flags_lock; + unsigned int fo_read_cache:1, /**< enable read-only cache */ + fo_writethrough_cache:1,/**< read cache writes */ + fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/ + fo_raid_degraded:1;/**< RAID device degraded */ + + struct obd_import *fo_mdc_imp; + struct obd_uuid fo_mdc_uuid; + struct lustre_handle fo_mdc_conn; + struct file **fo_last_objid_files; + __u64 *fo_last_objids; /* last created objid for groups, + * protected by fo_objidlock */ + + struct mutex fo_alloc_lock; + + atomic_t fo_r_in_flight; + atomic_t fo_w_in_flight; + + /* + * per-filter pool of kiobuf's allocated by filter_common_setup() and + * torn down by filter_cleanup(). + * + * This pool contains kiobuf used by + * filter_{prep,commit}rw_{read,write}() and is shared by all OST + * threads. + * + * Locking: protected by internal lock of cfs_hash, pool can be + * found from this hash table by t_id of ptlrpc_thread. + */ + struct cfs_hash *fo_iobuf_hash; + + struct brw_stats fo_filter_stats; + + int fo_fmd_max_num; /* per exp filter_mod_data */ + int fo_fmd_max_age; /* jiffies to fmd expiry */ + unsigned long fo_syncjournal:1, /* sync journal on writes */ + fo_sync_lock_cancel:2;/* sync on lock cancel */ + + + /* sptlrpc stuff */ + rwlock_t fo_sptlrpc_lock; + struct sptlrpc_rule_set fo_sptlrpc_rset; + + /* capability related */ + unsigned int fo_fl_oss_capa; + struct list_head fo_capa_keys; + struct hlist_head *fo_capa_hash; + int fo_sec_level; +}; + +struct timeout_item { + enum timeout_event ti_event; + cfs_time_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OSC_MAX_RIF_DEFAULT 8 +#define MDS_OSC_MAX_RIF_DEFAULT 50 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES +}; + +#define MDC_MAX_RIF_DEFAULT 8 +#define MDC_MAX_RIF_MAX 512 + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + int cl_conn_count; + /* max_mds_easize is purely a performance thing so we don't have to + * call obd_size_diskmd() all the time. */ + int cl_default_mds_easize; + int cl_max_mds_easize; + int cl_max_mds_cookiesize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + long cl_dirty; /* all _dirty_ in bytes */ + long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ + long cl_avail_grant; /* bytes of credit for ost */ + long cl_lost_grant; /* lost credits (trunc) */ + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + cfs_time_t cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ + int cl_chunkbits; + int cl_chunk; + int cl_extent_tax; /* extent overhead, by bytes */ + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. (Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together + * with client_obd_list_{un,}lock() and + * client_obd_list_lock_{init,done}() functions. + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. + */ + client_obd_lock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + int cl_r_in_flight; + int cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + __u32 cl_max_pages_per_rpc; + int cl_max_rpcs_in_flight; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /* lru for osc caching pages */ + struct cl_client_cache *cl_cache; + struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ + atomic_t *cl_lru_left; + atomic_t cl_lru_busy; + atomic_t cl_lru_shrinkers; + atomic_t cl_lru_in_list; + struct list_head cl_lru_list; /* lru page list */ + client_obd_lock_t cl_lru_list_lock; /* page list protector */ + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + struct mdc_rpc_lock *cl_rpc_lock; + struct mdc_rpc_lock *cl_close_lock; + + /* mgc datastruct */ + struct semaphore cl_mgc_sem; + struct vfsmount *cl_mgc_vfsmnt; + struct dentry *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ + /* supported checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + cksum_type_t cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* used by quotacheck when the servers are older than 2.4 */ + int cl_qchk_stat; /* quotacheck stat of the peer */ +#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ +#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) +#warning "please consider removing quotacheck compatibility code" +#endif + + /* sequence manager */ + struct lu_client_seq *cl_seq; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + /* hash tables for osc_quota_info */ + cfs_hash_t *cl_quota_hash[MAXQUOTAS]; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + __u32 idx; + obd_id *data; +}; + +/* */ + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + __u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct mutex ost_health_mutex; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + int ec_nstripes; + __u64 ec_unique; +}; + +struct lov_qos_oss { + struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lqo_oss_list; /* link to lov_qos */ + __u64 lqo_bavail; /* total bytes avail on OSS */ + __u64 lqo_penalty; /* current penalty */ + __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ +}; + +struct ltd_qos { + struct lov_qos_oss *ltq_oss; /* oss info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ + __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ + unsigned int ltq_usable:1; /* usable for striping */ +}; + +/* Generic subset of OSTs */ +struct ost_pool { + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 + +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; +/* Stripe placement optimization */ +struct lov_qos { + struct list_head lq_oss_list; /* list of OSSs that targets use */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_oss_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lov_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the ost's all have approx. + the same space avail */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in + progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ +}; + +struct lov_tgt_desc { + struct list_head ltd_kill; + struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; + struct obd_export *ltd_exp; + struct ltd_qos ltd_qos; /* qos info per target */ + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1; /* should this target be deleted */ +}; + +/* Pool metadata */ +#define pool_tgt_size(_p) _p->pool_obds.op_size +#define pool_tgt_count(_p) _p->pool_obds.op_count +#define pool_tgt_array(_p) _p->pool_obds.op_array +#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + atomic_t pool_refcount; /* pool ref. counter */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + proc_dir_entry_t *pool_proc_entry; /* file in /proc */ + struct obd_device *pool_lobd; /* obd of the lov/lod to which + * this pool belongs */ +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_tgt_count; /* how many OBD's */ + __u32 lov_active_tgt_count; /* how many active */ + __u32 lov_death_row;/* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + cfs_hash_t *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + proc_dir_entry_t *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU and unstable data from upper layer */ + void *lov_cache; + + struct rw_semaphore lov_notify_lock; +}; + +struct lmv_tgt_desc { + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + int ltd_idx; + struct mutex ltd_fid_mutex; + unsigned long ltd_active:1; /* target up for requests */ +}; + +enum placement_policy { + PLACEMENT_CHAR_POLICY = 0, + PLACEMENT_NID_POLICY = 1, + PLACEMENT_INVAL_POLICY = 2, + PLACEMENT_MAX_POLICY +}; + +typedef enum placement_policy placement_policy_t; + +struct lmv_obd { + int refcount; + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + placement_policy_t lmv_placement; + struct lmv_desc desc; + struct obd_uuid cluuid; + struct obd_export *exp; + + struct mutex init_mutex; + int connected; + int max_easize; + int max_def_easize; + int max_cookiesize; + int server_timeout; + + int tgts_size; /* size of tgts array */ + struct lmv_tgt_desc **tgts; + + struct obd_connect_data conn_data; +}; + +struct niobuf_local { + __u64 lnb_file_offset; + __u32 lnb_page_offset; + __u32 len; + __u32 flags; + struct page *page; + struct dentry *dentry; + int lnb_grant_used; + int rc; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +static inline int is_osp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */ + if (strncmp(ptr + 1, "osc", 3) == 0) + return 1; + + if (strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + while (*(--ptr) != '-' && ptr != name); + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 && + strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0) + return 0; + + return 1; +} + +/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ +#define N_LOCAL_TEMP_PAGE 0x10000000 + +struct obd_trans_info { + __u64 oti_transno; + __u64 oti_xid; + /* Only used on the server side for tracking acks. */ + struct oti_req_ack_lock { + struct lustre_handle lock; + __u32 mode; + } oti_ack_locks[4]; + void *oti_handle; + struct llog_cookie oti_onecookie; + struct llog_cookie *oti_logcookies; + int oti_numcookies; + /** synchronous write is needed */ + unsigned long oti_sync_write:1; + + /* initial thread handling transaction */ + struct ptlrpc_thread * oti_thread; + __u32 oti_conn_cnt; + /** VBR: versions */ + __u64 oti_pre_version; + /** JobID */ + char *oti_jobid; + + struct obd_uuid *oti_ost_uuid; +}; + +static inline void oti_init(struct obd_trans_info *oti, + struct ptlrpc_request *req) +{ + if (oti == NULL) + return; + memset(oti, 0, sizeof(*oti)); + + if (req == NULL) + return; + + oti->oti_xid = req->rq_xid; + /** VBR: take versions from request */ + if (req->rq_reqmsg != NULL && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg); + oti->oti_pre_version = pre_version ? pre_version[0] : 0; + oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + /** called from mds_create_objects */ + if (req->rq_repmsg != NULL) + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + oti->oti_thread = req->rq_svc_thread; + if (req->rq_reqmsg != NULL) + oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); +} + +static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies) +{ + if (!oti) + return; + + if (num_cookies == 1) + oti->oti_logcookies = &oti->oti_onecookie; + else + OBD_ALLOC_LARGE(oti->oti_logcookies, + num_cookies * sizeof(oti->oti_onecookie)); + + oti->oti_numcookies = num_cookies; +} + +static inline void oti_free_cookies(struct obd_trans_info *oti) +{ + if (!oti || !oti->oti_logcookies) + return; + + if (oti->oti_logcookies == &oti->oti_onecookie) + LASSERT(oti->oti_numcookies == 1); + else + OBD_FREE_LARGE(oti->oti_logcookies, + oti->oti_numcookies*sizeof(oti->oti_onecookie)); + oti->oti_logcookies = NULL; + oti->oti_numcookies = 0; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* target added */ + OBD_NOTIFY_CREATE, + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Device disconnected */ + OBD_NOTIFY_DISCON, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC, + /* Configuration event */ + OBD_NOTIFY_CONFIG, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* bit-mask flags for config events */ +enum config_flags { + CONFIG_LOG = 0x1, /* finished processing config log */ + CONFIG_SYNC = 0x2, /* mdt synced 1 ost */ + CONFIG_TARGET = 0x4 /* one target is added */ +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). + */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + int olg_seq; + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; + + /* common and UUID name of this device */ + char obd_name[MAX_OBD_NAME]; + struct obd_uuid obd_uuid; + + struct lu_device *obd_lu_dev; + + int obd_minor; + /* bitfield modification is protected by obd_dev_lock */ + unsigned long obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1,/* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery is enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_async_recov:1, /* allow asynchronous orphan cleanup */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. */ + obd_process_conf:1; /* device is processing mgs config */ + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + cfs_hash_t *obd_uuid_hash; + /* nid-export hash body */ + cfs_hash_t *obd_nid_hash; + /* nid stats body */ + cfs_hash_t *obd_nid_stats_hash; + struct list_head obd_nid_stats; + atomic_t obd_refcount; + wait_queue_head_t obd_refcount_waitq; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + struct mutex obd_dev_mutex; + __u64 obd_last_committed; + struct fsfilt_operations *obd_fsops; + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + __u64 obd_osfs_age; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time_t obd_eviction_timer; /* for ping evictor */ + + int obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + int obd_delayed_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + timer_list_t obd_recovery_timer; + time_t obd_recovery_start; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ + int obd_recovery_time_hard; + int obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + struct target_recovery_data obd_recovery_data; + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + int obd_recovery_stage; + + union { + struct obd_device_target obt; + struct filter_obd filter; + struct client_obd cli; + struct ost_obd ost; + struct echo_client_obd echo_client; + struct echo_obd echo; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + /* Fields used by LProcFS */ + unsigned int obd_cntr_base; + struct lprocfs_stats *obd_stats; + + unsigned int md_cntr_base; + struct lprocfs_stats *md_stats; + + proc_dir_entry_t *obd_proc_entry; + proc_dir_entry_t *obd_proc_exports_entry; + proc_dir_entry_t *obd_svc_procroot; + struct lprocfs_stats *obd_svc_stats; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * Ldlm pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + int obd_pool_limit; + __u64 obd_pool_slv; + + /** + * A list of outstanding class_incref()'s against this obd. For + * debugging. + */ + struct lu_ref obd_reference; + + int obd_conn_inprogress; +}; + +#define OBD_LLOG_FL_SENDNOW 0x0001 +#define OBD_LLOG_FL_EXIT 0x0002 + +enum obd_cleanup_stage { +/* Special case hack for MDS LOVs */ + OBD_CLEANUP_EARLY, +/* can be directly mapped to .ldto_device_fini() */ + OBD_CLEANUP_EXPORTS, +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_BLOCKSIZE "blocksize" +#define KEY_CAPA_KEY "capa_key" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_LOCK_TO_STRIPE "lock_to_stripe" +#define KEY_LOVDESC "lovdesc" +#define KEY_LOV_IDX "lov_idx" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_MDS_CONN "mds_conn" +#define KEY_MGSSEC "mgssec" +#define KEY_NEXT_ID "next_id" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_CONNECT_FLAG "connect_flags" +#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_CHANGELOG_INDEX "changelog_index" + +struct lu_context; + +/* /!\ must be coherent with include/linux/namei.h on patched kernel */ +#define IT_OPEN (1 << 0) +#define IT_CREAT (1 << 1) +#define IT_READDIR (1 << 2) +#define IT_GETATTR (1 << 3) +#define IT_LOOKUP (1 << 4) +#define IT_UNLINK (1 << 5) +#define IT_TRUNC (1 << 6) +#define IT_GETXATTR (1 << 7) +#define IT_EXEC (1 << 8) +#define IT_PIN (1 << 9) +#define IT_LAYOUT (1 << 10) +#define IT_QUOTA_DQACQ (1 << 11) +#define IT_QUOTA_CONN (1 << 12) + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) + return LCK_CR; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usualy parent) */ + struct lu_fid op_fid2; /* operation fid2 (usualy child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + mdsno_t op_mds; /* what mds server open will go to */ + struct lustre_handle op_handle; + obd_time op_mod_time; + const char *op_name; + int op_namelen; + __u32 op_mode; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + + /* iattr fields and blocks. */ + struct iattr op_attr; + unsigned int op_attr_flags; + __u64 op_valid; + loff_t op_attr_blocks; + + /* Size-on-MDS epoch and flags. */ + __u64 op_ioepoch; + __u32 op_flags; + + /* Capa fields */ + struct obd_capa *op_capa1; + struct obd_capa *op_capa2; + + /* Various operation flags. */ + __u32 op_bias; + + /* Operation type */ + __u32 op_opc; + + /* Used by readdir */ + __u64 op_offset; + + /* Used by readdir */ + __u32 op_npages; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + __u32 op_cli_flags; +}; + +enum op_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + md_enqueue_cb_t mi_cb; + __u64 mi_cbdata; + unsigned int mi_generation; +}; + +struct obd_ops { + module_t *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_attach)(struct obd_device *dev, obd_count len, void *data); + int (*o_detach)(struct obd_device *dev); + int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev, + enum obd_cleanup_stage cleanup_stage); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, obd_count len, + void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. @ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. + */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *set); + int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src); + int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, int disk_len); + int (*o_preallocate)(struct lustre_handle *, obd_count *req, + obd_id *ids); + /* FIXME: add fid capability support for create & destroy! */ + int (*o_precreate)(struct obd_export *exp); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_create_async)(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti); + int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo); + int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set); + int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pgarr, + struct obd_trans_info *oti); + int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only); + int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, + obd_off size, int shrink); + int (*o_punch)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_sync)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set); + int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst, + struct lov_stripe_md *src, obd_size start, + obd_size end, struct obd_trans_info *oti); + int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst, + struct lustre_handle *srconn, struct lov_stripe_md *src, + obd_size start, obd_size end, struct obd_trans_info *); + int (*o_iterate)(struct lustre_handle *conn, + int (*)(obd_id, obd_seq, void *), + obd_id *startid, obd_seq seq, void *data); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local, + struct obd_trans_info *oti, struct lustre_capa *capa); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc); + int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset); + int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *); + int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *, + ldlm_cancel_flags_t flags, void *opaque); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *, + int cmd, obd_off *); + + /* llog related obd_methods */ + int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp, + struct obd_device *disk_obd, int *idx); + int (*o_llog_finish)(struct obd_device *obd, int count); + int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *); + + /* metadata-only methods */ + int (*o_pin)(struct obd_export *, const struct lu_fid *fid, + struct obd_capa *, struct obd_client_handle *, int flag); + int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + int (*o_ping)(const struct lu_env *, struct obd_export *exp); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. */ +}; + +enum { + LUSTRE_OPC_MKDIR = (1 << 0), + LUSTRE_OPC_SYMLINK = (1 << 1), + LUSTRE_OPC_MKNOD = (1 << 2), + LUSTRE_OPC_CREATE = (1 << 3), + LUSTRE_OPC_ANY = (1 << 4) +}; + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +struct lustre_md { + struct mdt_body *body; + struct lov_stripe_md *lsm; + struct lmv_stripe_md *mea; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif + struct mdt_remote_perm *remote_perm; + struct obd_capa *mds_capa; + struct obd_capa *oss_capa; +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; +}; + +struct lookup_intent; + +struct md_ops { + int (*m_getstatus)(struct obd_export *, struct lu_fid *, + struct obd_capa **); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *, + ldlm_iterator_t, void *); + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, int, int, __u32, __u32, cfs_cap_t, + __u64, struct ptlrpc_request **); + int (*m_done_writing)(struct obd_export *, struct md_op_data *, + struct md_open_data *); + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + struct lookup_intent *, struct md_op_data *, + struct lustre_handle *, void *, int, + struct ptlrpc_request **, __u64); + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + void *, int, struct lookup_intent *, int, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, int, const char *, int, + struct ptlrpc_request **); + int (*m_is_subdir)(struct obd_export *, const struct lu_fid *, + const struct lu_fid *, + struct ptlrpc_request **); + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + int , void *, int, struct ptlrpc_request **, + struct md_open_data **mod); + int (*m_sync)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, struct ptlrpc_request **); + int (*m_readpage)(struct obd_export *, struct md_op_data *, + struct page **, struct ptlrpc_request **); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, obd_valid, const char *, + const char *, int, int, int, __u32, + struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, obd_valid, const char *, + const char *, int, int, int, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, int, int, int); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct ptlrpc_request *); + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + + ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, ldlm_type_t, + ldlm_policy_data_t *, ldlm_mode_t, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct lu_fid *, + ldlm_policy_data_t *, ldlm_mode_t, + ldlm_cancel_flags_t flags, void *opaque); + int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, + renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); + + int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, __u32, + struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *, + struct ldlm_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + /* + * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to + * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a + * wrapper function in include/linux/obd_class.h. + */ +}; + +struct lsm_operations { + void (*lsm_free)(struct lov_stripe_md *); + int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, + struct obd_export *md_exp); + void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *, + obd_off *); + void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *, + obd_off *); + int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes, + __u16 *stripe_count); + int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmm); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch(magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + default: + CERROR("Cannot recognize lsm_magic %08x\n", magic); + return NULL; + } +} + +/* Requests for obd_extent_calc() */ +#define OBD_CALC_STRIPE_START 1 +#define OBD_CALC_STRIPE_END 2 + +static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo) +{ + return oinfo->oi_capa; +} + +static inline struct md_open_data *obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, int namelen, int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = strtoul(start, &end, 0); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) && (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; +} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h new file mode 100644 index 000000000000..c8249fbb0d72 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_cache.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h new file mode 100644 index 000000000000..5f740f1743ca --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_cksum.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include +#include + +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. */ +static inline obd_flag cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + obd_flag flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} + +static inline cksum_type_t cksum_type_unpack(obd_flag o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline cksum_type_t cksum_types_supported_client(void) +{ + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline cksum_type_t cksum_types_supported_server(void) +{ + int base_speed; + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. + * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +{ + return cksum_type_unpack(cksum_type_pack(cksum_types)); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. */ +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h new file mode 100644 index 000000000000..de5c5853647f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_class.h @@ -0,0 +1,2281 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef __CLASS_OBD_H +#define __CLASS_OBD_H + + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define OBD_STATFS_NODELAY 0x0001 /* requests should be send without delay + * and resends for avoid deadlocks */ +#define OBD_STATFS_FROM_CACHE 0x0002 /* the statfs callback should not update + * obd_osfs_age */ +#define OBD_STATFS_PTLRPCD 0x0004 /* requests will be sent via ptlrpcd + * instead of a specific set. This + * means that we cannot rely on the set + * interpret routine to be called. + * lov_statfs_fini() must thus be called + * by the request interpret routine */ +#define OBD_STATFS_FOR_MDT0 0x0008 /* The statfs is only for retrieving + * information from MDT0. */ +#define OBD_FL_PUNCH 0x00000001 /* To indicate it is punch operation */ + +/* OBD Device Declarations */ +extern struct obd_device *obd_devs[MAX_OBD_DEVICES]; +extern rwlock_t obd_dev_lock; + +/* OBD Operations Declarations */ +extern struct obd_device *class_conn2obd(struct lustre_handle *); +extern struct obd_device *class_exp2obd(struct obd_export *); +extern int class_handle_ioctl(unsigned int cmd, unsigned long arg); +extern int lustre_get_jobid(char *jobid); + +struct lu_device_type; + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +int class_register_type(struct obd_ops *, struct md_ops *, + struct lprocfs_vars *, const char *nm, + struct lu_device_type *ldt); +int class_unregister_type(const char *nm); + +struct obd_device *class_newdev(const char *type_name, const char *name); +void class_release_dev(struct obd_device *obd); + +int class_name2dev(const char *name); +struct obd_device *class_name2obd(const char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); +void class_obd_list(void); +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid); +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, + int *next); +struct obd_device * class_num2obd(int num); +int get_devices_count(void); + +int class_notify_sptlrpc_conf(const char *fsname, int namelen); + +char *obd_export_nid2str(struct obd_export *exp); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid); +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep); + +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); +void obd_exports_barrier(struct obd_device *obd); +int kuc_len(int payload_len); +struct kuc_hdr * kuc_ptr(void *p); +int kuc_ispayload(void *p); +void *kuc_alloc(int payload_len, int transport, int type); +void kuc_free(void *p, int payload_len); + +struct llog_handle; +struct llog_rec_hdr; +typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, + struct llog_rec_hdr *, void *); +/* obd_config.c */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name); +int class_process_config(struct lustre_cfg *lcfg); +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data); +int class_attach(struct lustre_cfg *lcfg); +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg); +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source); +void class_decref(struct obd_device *obd, + const char *scope, const void *source); +void dump_exports(struct obd_device *obd, int locks); +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg); +int class_add_uuid(const char *uuid, __u64 nid); + +/*obdecho*/ +#ifdef LPROCFS +extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +#define CFG_F_START 0x01 /* Set when we start updating from a log */ +#define CFG_F_MARKER 0x02 /* We are within a maker */ +#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ +#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ +#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ + +/* Passed as data param to class_config_parse_llog */ +struct config_llog_instance { + char *cfg_obdname; + void *cfg_instance; + struct super_block *cfg_sb; + struct obd_uuid cfg_uuid; + llog_cb_t cfg_callback; + int cfg_last_idx; /* for partial llog processing */ + int cfg_flags; +}; +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg); + +enum { + CONFIG_T_CONFIG = 0, + CONFIG_T_SPTLRPC = 1, + CONFIG_T_RECOVER = 2, + CONFIG_T_MAX = 3 +}; + +/* list of active configuration logs */ +struct config_llog_data { + struct ldlm_res_id cld_resid; + struct config_llog_instance cld_cfg; + struct list_head cld_list_chain; + atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ + struct config_llog_data *cld_recover; /* imperative recover log */ + struct obd_export *cld_mgcexp; + struct mutex cld_lock; + int cld_type; + unsigned int cld_stopping:1, /* we were told to stop + * watching */ + cld_lostlock:1; /* lock not requeued */ + char cld_logname[0]; +}; + +struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_del_profiles(void); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? OBD_OPT_ABORT_RECOV : 0) | + 0); +} + + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, + unsigned int ia_valid); +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid); +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid); +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo); +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo); + +#define OBT(dev) (dev)->obd_type +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +#define OBD_CHECK_DEV(obd) \ +do { \ + if (!(obd)) { \ + CERROR("NULL device\n"); \ + RETURN(-ENODEV); \ + } \ +} while (0) + +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ +do { \ + OBD_CHECK_DEV(obd); \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ + } \ +} while (0) + + +#ifdef LPROCFS +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)(0))->o_iocontrol)) + +#define OBD_COUNTER_INCREMENT(obdx, op) \ + if ((obdx)->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->obd_stats->ls_num); \ + lprocfs_counter_incr((obdx)->obd_stats, coffset); \ + } + +#define EXP_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \ + if ((export)->exp_nid_stats != NULL && \ + (export)->exp_nid_stats->nid_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_nid_stats->nid_stats, coffset);\ + } + +#define MD_COUNTER_OFFSET(op) \ + ((offsetof(struct md_ops, m_ ## op) - \ + offsetof(struct md_ops, m_getstatus)) \ + / sizeof(((struct md_ops *)(0))->m_getstatus)) + +#define MD_COUNTER_INCREMENT(obdx, op) \ + if ((obd)->md_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->md_stats->ls_num); \ + lprocfs_counter_incr((obdx)->md_stats, coffset); \ + } + +#define EXP_MD_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \ + if ((export)->exp_md_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_md_stats, coffset); \ + } + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define MD_COUNTER_INCREMENT(obd, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC + ,LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +#define OBD_CHECK_MD_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !MDP((obd), op)) { \ + if (err) \ + CERROR("md_" #op ": dev %s/%d no operation\n", \ + obd->obd_name, obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %s/%d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("obd_" #op ": dev %d no operation\n", \ + obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("lop_" #op ": dev %d no operation\n", \ + ctxt->loc_obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, + struct obd_export *exp, __u32 keylen, + void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val, + lsm); + RETURN(rc); +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + RETURN(rc); +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. + */ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + ldt = obd->obd_type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); + rc = OBP(obd, setup)(obd, cfg); + } + RETURN(rc); +} + +static inline int obd_precleanup(struct obd_device *obd, + enum obd_cleanup_stage cleanup_stage) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + if (cleanup_stage == OBD_CLEANUP_EXPORTS) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); + } + } + } + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd, cleanup_stage); + RETURN(rc); +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + OBD_CHECK_DT_OP(obd, cleanup, 0); + OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); + RETURN(rc); +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + ENTRY; + + /* If we set up but never connected, the + client import will not have been cleaned. */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + rc = OBP(obd, process_config)(obd, datalen, data); + } + OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; + + RETURN(rc); +} + +/* Pack an in-memory MD struct for storage on disk. + * Returns +ve size of packed MD (0 for free), or -ve error. + * + * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL). + * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed. + * If @*disk_tgt == NULL, it will be allocated + */ +static inline int obd_packmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, packmd); + EXP_COUNTER_INCREMENT(exp, packmd); + + rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src); + RETURN(rc); +} + +static inline int obd_size_diskmd(struct obd_export *exp, + struct lov_stripe_md *mem_src) +{ + return obd_packmd(exp, NULL, mem_src); +} + +/* helper functions */ +static inline int obd_alloc_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt == NULL); + return obd_packmd(exp, disk_tgt, NULL); +} + +static inline int obd_free_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt); + /* + * LU-2590, for caller's convenience, *disk_tgt could be host + * endianness, it needs swab to LE if necessary, while just + * lov_mds_md header needs it for figuring out how much memory + * needs to be freed. + */ + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) || + ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3))) + lustre_swab_lov_mds_md(*disk_tgt); + return obd_packmd(exp, disk_tgt, NULL); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL). + * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed. + * If @*mem_tgt == NULL, it will be allocated + */ +static inline int obd_unpackmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, + int disk_len) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, unpackmd); + EXP_COUNTER_INCREMENT(exp, unpackmd); + + rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len); + RETURN(rc); +} + +/* helper functions */ +static inline int obd_alloc_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + LASSERT(mem_tgt); + LASSERT(*mem_tgt == NULL); + return obd_unpackmd(exp, mem_tgt, NULL, 0); +} + +static inline int obd_free_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + int rc; + + LASSERT(mem_tgt); + LASSERT(*mem_tgt); + rc = obd_unpackmd(exp, mem_tgt, NULL, 0); + *mem_tgt = NULL; + return rc; +} + +static inline int obd_precreate(struct obd_export *exp) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, precreate); + OBD_COUNTER_INCREMENT(exp->exp_obd, precreate); + + rc = OBP(exp->exp_obd, precreate)(exp); + RETURN(rc); +} + +static inline int obd_create_async(struct obd_export *exp, + struct obd_info *oinfo, + struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, create_async); + EXP_COUNTER_INCREMENT(exp, create_async); + + rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti); + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); + + rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md *ea, + struct obd_trans_info *oti, + struct obd_export *md_exp, void *capa) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, destroy); + EXP_COUNTER_INCREMENT(exp, destroy); + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); + + rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo); + RETURN(rc); +} + +static inline int obd_getattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, getattr_async); + EXP_COUNTER_INCREMENT(exp, getattr_async); + + rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set); + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); + + rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti); + RETURN(rc); +} + +/* This performs all the requests set init/wait/destroy actions. */ +static inline int obd_setattr_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* This adds all the requests into @set if @set != NULL, otherwise + all requests are sent asynchronously without waiting for response. */ +static inline int obd_setattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); + + rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data); + RETURN(rc); +} + +static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); + EXP_COUNTER_INCREMENT(exp, ping); + + rc = OBP(exp->exp_obd, ping)(env, exp); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline void obd_getref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } + EXIT; +} + +static inline void obd_putref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } + EXIT; +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +static inline int obd_extent_calc(struct obd_export *exp, + struct lov_stripe_md *md, + int cmd, obd_off *offset) +{ + int rc; + ENTRY; + EXP_CHECK_DT_OP(exp, extent_calc); + rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset); + RETURN(rc); +} + +static inline struct dentry * +obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen) +{ + struct lvfs_run_ctxt *ctxt = &exp->exp_obd->obd_lvfs_ctxt; + LASSERT(exp->exp_obd); + + return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi), + exp->exp_obd); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + struct obd_device *obd; + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + RETURN(rc); +} + +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* @max_age is the oldest time in jiffies that we accept using a cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd = exp->exp_obd; + ENTRY; + + if (obd == NULL) + RETURN(-EINVAL); + + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + } + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo, + obd_size start, obd_size end) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, sync); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +static inline int obd_sync(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, sync); + + rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set); + RETURN(rc); +} + +static inline int obd_punch_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, punch); + EXP_COUNTER_INCREMENT(exp, punch); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +static inline int obd_punch(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, punch); + EXP_COUNTER_INCREMENT(exp, punch); + + rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset); + RETURN(rc); +} + +static inline int obd_brw(int cmd, struct obd_export *exp, + struct obd_info *oinfo, obd_count oa_bufs, + struct brw_page *pg, struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, brw); + EXP_COUNTER_INCREMENT(exp, brw); + + if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) { + CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, " + "or OBD_BRW_CHECK\n"); + LBUG(); + } + + rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti); + RETURN(rc); +} + +static inline int obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local, + struct obd_trans_info *oti, + struct lustre_capa *capa) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); + + rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local, oti, capa); + RETURN(rc); +} + +static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc) +{ + ENTRY; + + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, oti, rc); + RETURN(rc); +} + +static inline int obd_merge_lvb(struct obd_export *exp, + struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, merge_lvb); + EXP_COUNTER_INCREMENT(exp, merge_lvb); + + rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only); + RETURN(rc); +} + +static inline int obd_adjust_kms(struct obd_export *exp, + struct lov_stripe_md *lsm, obd_off size, + int shrink) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, adjust_kms); + EXP_COUNTER_INCREMENT(exp, adjust_kms); + + rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink); + RETURN(rc); +} + +static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +static inline int obd_enqueue_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, enqueue); + EXP_COUNTER_INCREMENT(exp, enqueue); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +static inline int obd_enqueue(struct obd_export *exp, + struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, enqueue); + EXP_COUNTER_INCREMENT(exp, enqueue); + + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set); + RETURN(rc); +} + +static inline int obd_change_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, change_cbdata); + EXP_COUNTER_INCREMENT(exp, change_cbdata); + + rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data); + RETURN(rc); +} + +static inline int obd_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, find_cbdata); + EXP_COUNTER_INCREMENT(exp, find_cbdata); + + rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data); + RETURN(rc); +} + +static inline int obd_cancel(struct obd_export *exp, + struct lov_stripe_md *ea, __u32 mode, + struct lustre_handle *lockh) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, cancel); + EXP_COUNTER_INCREMENT(exp, cancel); + + rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh); + RETURN(rc); +} + +static inline int obd_cancel_unused(struct obd_export *exp, + struct lov_stripe_md *ea, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, cancel_unused); + EXP_COUNTER_INCREMENT(exp, cancel_unused); + + rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque); + RETURN(rc); +} + +static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct obd_client_handle *handle, + int flag) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, pin); + EXP_COUNTER_INCREMENT(exp, pin); + + rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag); + RETURN(rc); +} + +static inline int obd_unpin(struct obd_export *exp, + struct obd_client_handle *handle, int flag) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, unpin); + EXP_COUNTER_INCREMENT(exp, unpin); + + rc = OBP(exp->exp_obd, unpin)(exp, handle, flag); + RETURN(rc); +} + + +static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); + OBP(obd, import_event)(obd, imp, event); + } + EXIT; +} + +static inline int obd_llog_connect(struct obd_export *exp, + struct llogd_conn_body *body) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0); + EXP_COUNTER_INCREMENT(exp, llog_connect); + + rc = OBP(exp->exp_obd, llog_connect)(exp, body); + RETURN(rc); +} + + +static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev, + void *data) +{ + int rc; + ENTRY; + OBD_CHECK_DEV(obd); + + /* the check for async_recov is a complete hack - I'm hereby + overloading the meaning to also mean "this was called from + mds_postsetup". I know that my mds is able to handle notifies + by this point, and it needs to get them to execute mds_postrecov. */ + if (!obd->obd_set_up && !obd->obd_async_recov) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + OBD_COUNTER_INCREMENT(obd, notify); + rc = OBP(obd, notify)(obd, watched, ev, data); + RETURN(rc); +} + +static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev, + void *data) +{ + int rc1; + int rc2; + + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc1 = obd_notify(observer->obd_observer, observed, ev, data); + else + rc1 = 0; + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, + onu->onu_owner, NULL); + else + rc2 = 0; + + return rc1 ? rc1 : rc2; +} + +static inline int obd_quotacheck(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, quotacheck); + EXP_COUNTER_INCREMENT(exp, quotacheck); + + rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only suppored reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. + * <0 on error + */ + int rc; + ENTRY; + + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + ENTRY; + OBD_CHECK_DEV(obd); + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +static inline int obd_pin_observer(struct obd_device *obd, + struct obd_device **observer) +{ + ENTRY; + down_read(&obd->obd_observer_link_sem); + if (!obd->obd_observer) { + *observer = NULL; + up_read(&obd->obd_observer_link_sem); + RETURN(-ENOENT); + } + *observer = obd->obd_observer; + RETURN(0); +} + +static inline int obd_unpin_observer(struct obd_device *obd) +{ + ENTRY; + up_read(&obd->obd_observer_link_sem); + RETURN(0); +} + +#if 0 +static inline int obd_register_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb, + obd_pin_extent_cb pin_cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb); + + rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb); + RETURN(rc); +} + +static inline int obd_unregister_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb); + + rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb); + RETURN(rc); +} + +static inline int obd_register_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb); + + rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb); + RETURN(rc); +} + +static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb); + + rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb); + RETURN(rc); +} +#endif + +/* metadata helpers */ +static inline int md_getstatus(struct obd_export *exp, + struct lu_fid *fid, struct obd_capa **pc) +{ + int rc; + ENTRY; + + EXP_CHECK_MD_OP(exp, getstatus); + EXP_MD_COUNTER_INCREMENT(exp, getstatus); + rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc); + RETURN(rc); +} + +static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + RETURN(rc); +} + +static inline int md_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, find_cbdata); + EXP_MD_COUNTER_INCREMENT(exp, find_cbdata); + rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data); + RETURN(rc); +} + +static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); + RETURN(rc); +} + +static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + RETURN(rc); +} + +static inline int md_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, done_writing); + EXP_MD_COUNTER_INCREMENT(exp, done_writing); + rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod); + RETURN(rc); +} + +static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + void *lmm, int lmmsize, + struct ptlrpc_request **req, + int extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + RETURN(rc); +} + +static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr_name); + EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, void *lmm, + int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, + it, lookup_flags, reqp, cb_blocking, + extra_lock_flags); + RETURN(rc); +} + +static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, + int newlen, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + RETURN(rc); +} + +static inline int md_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, is_subdir); + EXP_MD_COUNTER_INCREMENT(exp, is_subdir); + rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request); + RETURN(rc); +} + +static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, + ea2, ea2len, request, mod); + RETURN(rc); +} + +static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, sync); + EXP_MD_COUNTER_INCREMENT(exp, sync); + rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request); + RETURN(rc); +} + +static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata, + struct page **pages, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, readpage); + EXP_MD_COUNTER_INCREMENT(exp, readpage); + rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request); + RETURN(rc); +} + +static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + RETURN(rc); +} + +static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md)); +} + +static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); +} + +static inline int md_setxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + obd_valid valid, const char *name, + const char *input, int input_size, + int output_size, int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + suppgid, request)); +} + +static inline int md_getxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + obd_valid valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + request)); +} + +static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct ptlrpc_request *open_req) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req)); +} + +static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); +} + +static inline int md_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); +} + +static inline int md_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + ENTRY; + + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + flags, opaque); + RETURN(rc); +} + +static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + ldlm_type_t type, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh)); +} + +static inline int md_init_ea_size(struct obd_export *exp, int easize, + int def_asize, int cookiesize) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize, + cookiesize)); +} + +static inline int md_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, get_remote_perm); + EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm); + RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid, + request)); +} + +static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa, + renew_capa_cb_t cb) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, renew_capa); + EXP_MD_COUNTER_INCREMENT(exp, renew_capa); + rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb); + RETURN(rc); +} + +static inline int md_unpack_capa(struct obd_export *exp, + struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unpack_capa); + EXP_MD_COUNTER_INCREMENT(exp, unpack_capa); + rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc); + RETURN(rc); +} + +static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo); + RETURN(rc); +} + +static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + RETURN(rc); +} + + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +/* support routines */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO); \ +} while(0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while(0) + + +static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid) +{ + /* something here */ +} + +static inline void fid2obdo(struct lu_fid *fid, struct obdo *oa) +{ + /* something here */ +} + +typedef int (*register_lwp_cb)(void *data); + +struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. + * + * // XXX do not look into _superhack with remaining eye + * // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ + +/* sysctl.c */ +extern void obd_sysctl_init (void); +extern void obd_sysctl_clean (void); + +/* uuid.c */ +typedef __u8 class_uuid_t[16]; +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); +void class_init_uuidlist(void); +void class_exit_uuidlist(void); + +/* mea.c */ +int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen); +int raw_name2idx(int hashtype, int count, const char *name, int namelen); + +/* prng.c */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h new file mode 100644 index 000000000000..d82f3341d0a8 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_lov.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_LOV_H__ +#define _OBD_LOV_H__ + +#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS) + +static inline int lov_stripe_md_size(__u16 stripes) +{ + return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*); +} + +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +struct lov_version_size { + __u32 lvs_magic; + size_t lvs_lmm_size; + size_t lvs_lod_size; +}; + +static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic) +{ + static const struct lov_version_size lmm_ver_size[] = { + { .lvs_magic = LOV_MAGIC_V3, + .lvs_lmm_size = sizeof(struct lov_mds_md_v3), + .lvs_lod_size = sizeof(struct lov_ost_data_v1) }, + { .lvs_magic = LOV_MAGIC_V1, + .lvs_lmm_size = sizeof(struct lov_mds_md_v1), + .lvs_lod_size = sizeof(struct lov_ost_data_v1)} }; + int i; + + for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) { + if (lmm_magic == lmm_ver_size[i].lvs_magic) { + if (ea_size <= lmm_ver_size[i].lvs_lmm_size) + return 0; + return (ea_size - lmm_ver_size[i].lvs_lmm_size) / + lmm_ver_size[i].lvs_lod_size; + } + } + + /* Invalid LOV magic, so no stripes could fit */ + return 0; +} + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. */ +#if BITS_PER_LONG > 32 +# define lov_do_div64(n,base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ + }) +#else +# define lov_do_div64(n,base) ({ \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(n, base); \ + } \ + __rem; \ + }) +#endif + +#define IOC_LOV_TYPE 'g' +#define IOC_LOV_MIN_NR 50 +#define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long) +#define IOC_LOV_MAX_NR 50 + +#define QOS_DEFAULT_THRESHOLD 10 /* MB */ +#define QOS_DEFAULT_MAXAGE 5 /* Seconds */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h new file mode 100644 index 000000000000..af89843c312b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_ost.h @@ -0,0 +1,96 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/obd_ost.h + * + * Data structures for object storage targets and client: OST & OSC's + * + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_OST_H +#define _LUSTRE_OST_H + +#include + +struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + obd_count aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; +}; + +#define osc_grant_args osc_brw_async_args +struct osc_async_args { + struct obd_info *aa_oi; +}; + +struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +struct osc_fsync_args { + struct obd_info *fa_oi; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + __u64 *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; + unsigned int oa_agl:1; +}; + +#if 0 +int osc_extent_blocking_cb(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +#endif + +#endif diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h new file mode 100644 index 000000000000..5f2b4e88f78f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_support.h @@ -0,0 +1,853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include +#include +#include + +#include + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_MEMORY_PAGES_STAT = 1, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned int obd_sync_filter; +extern unsigned int obd_max_dirty_pages; +extern atomic_t obd_unstable_pages; +extern atomic_t obd_dirty_pages; +extern atomic_t obd_dirty_transit_pages; +extern unsigned int obd_alloc_fail_rate; +extern char obd_jobid_var[]; + +/* lvfs.c */ +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line); + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 +#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) */ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b +#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126 +#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_OST_ENOENT 0x217 +#define OBD_FAIL_OST_QUOTACHECK_NET 0x218 +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 + +#define OBD_FAIL_OBD_PING_NET 0x600 +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +#define OBD_FAIL_OBD_LOGD_NET 0x602 +#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +#define OBD_FAIL_TGT_LAST_REPLAY 0x710 +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_GENERAL_ALLOC 0xC00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c + +/* UPDATE */ +#define OBD_FAIL_UPDATE_OBJ_NET 0x1700 +#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +extern atomic_t libcfs_kmemory; + +#ifdef LPROCFS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) +#define obd_pages_add(order) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sub(order) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); +extern __u64 obd_pages_max(void); + +#else + +extern __u64 obd_alloc; +extern __u64 obd_pages; + +extern __u64 obd_max_alloc; +extern __u64 obd_max_pages; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +static inline void obd_pages_add(int order) +{ + obd_pages += 1<< order; + if (obd_pages > obd_max_pages) + obd_max_pages = obd_pages; +} + +static inline void obd_pages_sub(int order) +{ + obd_pages -= 1<< order; +} + +#define obd_memory_sum() (obd_alloc) +#define obd_pages_sum() (obd_pages) + +#define obd_memory_max() (obd_max_alloc) +#define obd_pages_max() (obd_max_pages) + +#endif + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC) + +#define OBD_ALLOC_FAIL_BITS 24 +#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1) +#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100) + +#if defined(LUSTRE_UTILS) /* this version is for utils only */ +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags) : \ + cfs_cpt_malloc(cptab, cpt, size, flags); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \ + (int)(size), __FILE__, __LINE__); \ + } else { \ + memset(ptr, 0, size); \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \ + (int)(size), ptr); \ + } \ +} while (0) + +#else /* this version is for the kernel and liblustre */ +#define OBD_FREE_RTN0(ptr) \ +({ \ + kfree(ptr); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags) : \ + cfs_cpt_malloc(cptab, cpt, size, flags); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "km", size, \ + __FILE__, __LINE__) || \ + OBD_FREE_RTN0(ptr)))){ \ + memset(ptr, 0, size); \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + } \ +} while (0) +#endif + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr)) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr)) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr)) + +# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = cptab == NULL ? \ + vmalloc(size) : \ + cfs_cpt_vmalloc(cptab, cpt, size); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + memset(ptr, 0, size); \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +# define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size) +# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) + + +/* Allocations above this size are considered too big and could not be done + * atomically. + * + * Be very careful when changing this value, especially when decreasing it, + * since vmalloc in Linux doesn't perform well on multi-cores system, calling + * vmalloc in critical path would hurt peformance badly. See LU-66. + */ +#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE) + +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VMALLOC(ptr, size); \ + else \ + OBD_ALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ + else \ + OBD_CPT_ALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if (size > OBD_ALLOC_BIG) \ + OBD_VFREE(ptr, size); \ + else \ + OBD_FREE(ptr, size); \ +} while (0) + + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) (ptr) = (void *)0xdeadbeef +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while(0) + + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while(0) + + +#define OBD_VFREE(ptr, size) \ + do { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + vfree(ptr); \ + POISON_PTR(ptr); \ + } while (0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. */ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? \ + kmem_cache_alloc(slab, type) : \ + cfs_mem_cache_cpt_alloc(slab, cptab, cpt, type); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "slab-", size, \ + __FILE__, __LINE__) || \ + OBD_SLAB_FREE_RTN0(ptr, slab)))) { \ + memset(ptr, 0, size); \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ + } \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr)) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr)) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr)) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr)) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, (sizeof(str)-1)) == 0) + +/* Wrapper for contiguous page frame allocation */ +#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \ +do { \ + (ptr) = (cptab) == NULL ? \ + alloc_page(gfp_mask) : \ + cfs_page_cpt_alloc(cptab, cpt, gfp_mask); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\ + "failed\n", (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT)); \ + CERROR(LPU64" total bytes and "LPU64" total pages " \ + "("LPU64" bytes) allocated by Lustre, " \ + "%d total bytes by LNET\n", \ + obd_memory_sum(), \ + obd_pages_sum() << PAGE_CACHE_SHIFT, \ + obd_pages_sum(), \ + atomic_read(&libcfs_kmemory)); \ + } else { \ + obd_pages_add(0); \ + CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \ + LPU64" bytes at %p.\n", \ + (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \ + } \ +} while (0) + +#define OBD_PAGE_ALLOC(ptr, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask) +#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) + +#define OBD_PAGE_FREE(ptr) \ +do { \ + LASSERT(ptr); \ + obd_pages_sub(0); \ + CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \ + "at %p.\n", \ + (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \ + ptr); \ + __free_page(ptr); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) + +#endif diff --git a/drivers/staging/lustre/lustre/lclient/glimpse.c b/drivers/staging/lustre/lustre/lclient/glimpse.c new file mode 100644 index 000000000000..7f3974be1f92 --- /dev/null +++ b/drivers/staging/lustre/lustre/lclient/glimpse.c @@ -0,0 +1,274 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +# include +# include +# include +# include +# include + +#include "cl_object.h" +#include "lclient.h" +# include "../llite/llite_internal.h" + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +/* + * Check whether file has possible unwriten pages. + * + * \retval 1 file is mmap-ed or has dirty pages + * 0 otherwise + */ +blkcnt_t dirty_cnt(struct inode *inode) +{ + blkcnt_t cnt = 0; + struct ccc_object *vob = cl_inode2ccc(inode); + void *results[1]; + + if (inode->i_mapping != NULL) + cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->page_tree, + results, 0, 1, + PAGECACHE_TAG_DIRTY); + if (cnt == 0 && atomic_read(&vob->cob_mmap_cnt) > 0) + cnt = 1; + + return (cnt > 0) ? 1 : 0; +} + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl) +{ + struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr; + struct cl_inode_info *lli = cl_i2info(inode); + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) { + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + if (lli->lli_has_smd) { + /* NOTE: this looks like DLM lock request, but it may + * not be one. Due to CEF_ASYNC flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_PHANTOM; + descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; + if (agl) + descr->cld_enq_flags |= CEF_AGL; + cio->cui_glimpse = 1; + /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, can be enqueued out-of-order. + * + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + lock = cl_lock_request(env, io, descr, "glimpse", + current); + cio->cui_glimpse = 0; + + if (lock == NULL) + RETURN(0); + + if (IS_ERR(lock)) + RETURN(PTR_ERR(lock)); + + LASSERT(agl == 0); + result = cl_wait(env, lock); + if (result == 0) { + cl_merge_lvb(env, inode); + if (cl_isize_read(inode) > 0 && + inode->i_blocks == 0) { + /* + * LU-417: Add dirty pages block count + * lest i_blocks reports 0, some "cp" or + * "tar" may think it's a completely + * sparse file and skip it. + */ + inode->i_blocks = dirty_cnt(inode); + } + cl_unuse(env, lock); + } + cl_lock_release(env, lock, "glimpse", current); + } else { + CDEBUG(D_DLMTRACE, "No objects for inode\n"); + cl_merge_lvb(env, inode); + } + } + + RETURN(result); +} + +static int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, int *refcheck) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(cl_inode_mode(inode))) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + io = ccc_env_thread_io(env); + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = +1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size0(struct inode *inode, int agl) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + int result; + int refcheck; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { + again: + io->ci_verify_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj, + agl); + + OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + } + RETURN(result); +} + +int cl_local_size(struct inode *inode) +{ + struct lu_env *env = NULL; + struct cl_io *io = NULL; + struct ccc_thread_info *cti; + struct cl_object *clob; + struct cl_lock_descr *descr; + struct cl_lock *lock; + int result; + int refcheck; + + ENTRY; + + if (!cl_i2info(inode)->lli_has_smd) + RETURN(0); + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + clob = io->ci_obj; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result > 0) + result = io->ci_result; + else if (result == 0) { + cti = ccc_env_info(env); + descr = &cti->cti_descr; + + *descr = whole_file; + descr->cld_obj = clob; + lock = cl_lock_peek(env, io, descr, "localsize", current); + if (lock != NULL) { + cl_merge_lvb(env, inode); + cl_unuse(env, lock); + cl_lock_release(env, lock, "localsize", current); + result = 0; + } else + result = -ENODATA; + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(result); +} diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c new file mode 100644 index 000000000000..4a0166687f07 --- /dev/null +++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c @@ -0,0 +1,1325 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +# include +# include +# include +# include +# include +# include +# include +# include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../llite/llite_internal.h" + +const struct cl_req_operations ccc_req_ops; + +/* + * ccc_ prefix stands for "Common Client Code". + */ + +static struct kmem_cache *ccc_lock_kmem; +static struct kmem_cache *ccc_object_kmem; +static struct kmem_cache *ccc_thread_kmem; +static struct kmem_cache *ccc_session_kmem; +static struct kmem_cache *ccc_req_kmem; + +static struct lu_kmem_descr ccc_caches[] = { + { + .ckd_cache = &ccc_lock_kmem, + .ckd_name = "ccc_lock_kmem", + .ckd_size = sizeof (struct ccc_lock) + }, + { + .ckd_cache = &ccc_object_kmem, + .ckd_name = "ccc_object_kmem", + .ckd_size = sizeof (struct ccc_object) + }, + { + .ckd_cache = &ccc_thread_kmem, + .ckd_name = "ccc_thread_kmem", + .ckd_size = sizeof (struct ccc_thread_info), + }, + { + .ckd_cache = &ccc_session_kmem, + .ckd_name = "ccc_session_kmem", + .ckd_size = sizeof (struct ccc_session) + }, + { + .ckd_cache = &ccc_req_kmem, + .ckd_name = "ccc_req_kmem", + .ckd_size = sizeof (struct ccc_req) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +void *ccc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, ccc_thread_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +void ccc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, ccc_thread_kmem); +} + +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, ccc_session_kmem, __GFP_IO); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_session *session = data; + OBD_SLAB_FREE_PTR(session, ccc_session_kmem); +} + +struct lu_context_key ccc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ccc_key_init, + .lct_fini = ccc_key_fini +}; + +struct lu_context_key ccc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = ccc_session_key_init, + .lct_fini = ccc_session_key_fini +}; + + +/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */ +// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key); + +int ccc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct ccc_device *vdv; + int rc; + ENTRY; + + vdv = lu2ccc_dev(d); + vdv->cdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +struct lu_device *ccc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2ccc_dev(d)->cdv_next); +} + +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops) +{ + struct ccc_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->cdv_cl.cd_lu_dev; + cl_device_init(&vdv->cdv_cl, t); + ccc2lu_dev(vdv)->ld_ops = luops; + vdv->cdv_cl.cd_ops = clops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->cdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + ccc_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +struct lu_device *ccc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct ccc_device *vdv = lu2ccc_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->cdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct ccc_req *vrq; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(vrq, ccc_req_kmem, __GFP_IO); + if (vrq != NULL) { + cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** + * An `emergency' environment used by ccc_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by ccc_inode_fini_guard + * mutex. + */ +static struct lu_env *ccc_inode_fini_env = NULL; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +static DEFINE_MUTEX(ccc_inode_fini_guard); +static int dummy_refcheck; + +int ccc_global_init(struct lu_device_type *device_type) +{ + int result; + + result = lu_kmem_init(ccc_caches); + if (result) + return result; + + result = lu_device_type_init(device_type); + if (result) + goto out_kmem; + + ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (IS_ERR(ccc_inode_fini_env)) { + result = PTR_ERR(ccc_inode_fini_env); + goto out_device; + } + + ccc_inode_fini_env->le_ctx.lc_cookie = 0x4; + return 0; +out_device: + lu_device_type_fini(device_type); +out_kmem: + lu_kmem_fini(ccc_caches); + return result; +} + +void ccc_global_fini(struct lu_device_type *device_type) +{ + if (ccc_inode_fini_env != NULL) { + cl_env_put(ccc_inode_fini_env, &dummy_refcheck); + ccc_inode_fini_env = NULL; + } + lu_device_type_fini(device_type); + lu_kmem_fini(ccc_caches); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops) +{ + struct ccc_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(vob, ccc_object_kmem, __GFP_IO); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = ccc2lu(vob); + hdr = &vob->cob_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->cob_cl.co_ops = clops; + obj->lo_ops = luops; + } else + obj = NULL; + return obj; +} + +int ccc_object_init0(const struct lu_env *env, + struct ccc_object *vob, + const struct cl_object_conf *conf) +{ + vob->cob_inode = conf->coc_inode; + vob->cob_transient_pages = 0; + cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page)); + return 0; +} + +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct ccc_device *dev = lu2ccc_dev(obj->lo_dev); + struct ccc_object *vob = lu2ccc(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->cdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + INIT_LIST_HEAD(&vob->cob_pending_list); + lu_object_add(obj, below); + result = ccc_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + return result; +} + +void ccc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct ccc_object *vob = lu2ccc(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, ccc_object_kmem); +} + +int ccc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused, + const struct cl_lock_operations *lkops) +{ + struct ccc_lock *clk; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR_GFP(clk, ccc_lock_kmem, __GFP_IO); + if (clk != NULL) { + cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + return 0; +} + +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = ccc_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = cl_inode_mtime(inode); + lvb->lvb_atime = cl_inode_atime(inode); + lvb->lvb_ctime = cl_inode_ctime(inode); + /* + * LU-417: Add dirty pages block count lest i_blocks reports 0, some + * "cp" or "tar" on remote node may think it's a completely sparse file + * and skip it. + */ + if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) + lvb->lvb_blocks = dirty_cnt(inode); + RETURN(0); +} + + + +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + /* TODO: destroy all pages attached to this object. */ + return 0; +} + +static void ccc_object_size_lock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_isize_lock(inode); + cl_object_attr_lock(obj); +} + +static void ccc_object_size_unlock(struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_object_attr_unlock(obj); + cl_isize_unlock(inode); +} + +/***************************************************************************** + * + * Page operations. + * + */ + +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2vm_page(slice); +} + +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr; + struct cl_page *page = slice->cpl_page; + + int result; + + ENTRY; + + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + io->ci_type == CIT_FAULT) { + if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + result = -EBUSY; + else { + desc->cld_start = page->cp_index; + desc->cld_end = page->cp_index; + desc->cld_obj = page->cp_obj; + desc->cld_mode = CLM_READ; + result = cl_queue_match(&io->ci_lockset.cls_done, + desc) ? -EBUSY : 0; + } + } else + result = 0; + RETURN(result); +} + +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + +void ccc_transient_page_verify(const struct cl_page *page) +{ +} + +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, + int nonblock) +{ + ccc_transient_page_verify(slice->cpl_page); + return 0; +} + +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + ccc_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* transient page should always be sent. */ + RETURN(0); +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); +} + +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct ccc_lock *clk = cl2ccc_lock(slice); + OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem); +} + +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +/** + * Implementation of cl_lock_operations::clo_fits_into() methods for ccc + * layer. This function is executed every time io finds an existing lock in + * the lock cache while creating new lock. This function has to decide whether + * cached lock "fits" into io. + * + * \param slice lock to be checked + * \param io IO that wants a lock. + * + * \see lov_lock_fits_into(). + */ +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock *lock = slice->cls_lock; + const struct cl_lock_descr *descr = &lock->cll_descr; + const struct ccc_io *cio = ccc_env_io(env); + int result; + + ENTRY; + /* + * Work around DLM peculiarity: it assumes that glimpse + * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns reads lock + * when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set. Make + * sure that glimpse doesn't get CLM_WRITE top-lock, so that it + * doesn't enqueue CLM_WRITE sub-locks. + */ + if (cio->cui_glimpse) + result = descr->cld_mode != CLM_WRITE; + + /* + * Also, don't match incomplete write locks for read, otherwise read + * would enqueue missing sub-locks in the write mode. + */ + else if (need->cld_mode != descr->cld_mode) + result = lock->cll_state >= CLS_ENQUEUED; + else + result = 1; + RETURN(result); +} + +/** + * Implements cl_lock_operations::clo_state() method for ccc layer, invoked + * whenever lock state changes. Transfers object attributes, that might be + * updated as a result of lock acquiring into inode. + */ +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct cl_lock *lock = slice->cls_lock; + ENTRY; + + /* + * Refresh inode attributes when the lock is moving into CLS_HELD + * state, and only when this is a result of real enqueue, rather than + * of finding lock in the cache. + */ + if (state == CLS_HELD && lock->cll_state < CLS_HELD) { + struct cl_object *obj; + struct inode *inode; + + obj = slice->cls_obj; + inode = ccc_object_inode(obj); + + /* vmtruncate() sets the i_size + * under both a DLM lock and the + * ll_inode_size_lock(). If we don't get the + * ll_inode_size_lock() here we can match the DLM lock and + * reset i_size. generic_file_write can then trust the + * stale i_size when doing appending writes and effectively + * cancel the result of the truncate. Getting the + * ll_inode_size_lock() after the enqueue maintains the DLM + * -> ll_inode_size_lock() acquiring order. */ + if (lock->cll_descr.cld_start == 0 && + lock->cll_descr.cld_end == CL_PAGE_EOF) + cl_merge_lvb(env, inode); + } + EXIT; +} + +/***************************************************************************** + * + * io operations. + * + */ + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); +} + +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock_descr *descr = &cio->cui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); + + memset(&cio->cui_link, 0, sizeof cio->cui_link); + + if (cio->cui_fd && (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + descr->cld_mode = CLM_GROUP; + descr->cld_gid = cio->cui_fd->fd_grouplock.cg_gid; + } else { + descr->cld_mode = mode; + } + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + descr->cld_enq_flags = enqflags; + + cl_io_lock_add(env, io, &cio->cui_link); + RETURN(0); +} + +void ccc_io_update_iov(const struct lu_env *env, + struct ccc_io *cio, struct cl_io *io) +{ + int i; + size_t size = io->u.ci_rw.crw_count; + + cio->cui_iov_olen = 0; + if (!cl_is_normalio(env, io) || cio->cui_tot_nrsegs == 0) + return; + + for (i = 0; i < cio->cui_tot_nrsegs; i++) { + struct iovec *iv = &cio->cui_iov[i]; + + if (iv->iov_len < size) + size -= iv->iov_len; + else { + if (iv->iov_len > size) { + cio->cui_iov_olen = iv->iov_len; + iv->iov_len = size; + } + break; + } + } + + cio->cui_nrsegs = i + 1; + LASSERTF(cio->cui_tot_nrsegs >= cio->cui_nrsegs, + "tot_nrsegs: %lu, nrsegs: %lu\n", + cio->cui_tot_nrsegs, cio->cui_nrsegs); +} + +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + return ccc_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + CLOBINVRNT(env, ios->cis_io->ci_obj, + ccc_object_invariant(ios->cis_io->ci_obj)); +} + +void ccc_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, + size_t nob) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + if (!cl_is_normalio(env, io)) + return; + + LASSERT(cio->cui_tot_nrsegs >= cio->cui_nrsegs); + LASSERT(cio->cui_tot_count >= nob); + + cio->cui_iov += cio->cui_nrsegs; + cio->cui_tot_nrsegs -= cio->cui_nrsegs; + cio->cui_tot_count -= nob; + + /* update the iov */ + if (cio->cui_iov_olen > 0) { + struct iovec *iv; + + cio->cui_iov--; + cio->cui_tot_nrsegs++; + iv = &cio->cui_iov[0]; + if (io->ci_continue) { + iv->iov_base += iv->iov_len; + LASSERT(cio->cui_iov_olen > iv->iov_len); + iv->iov_len = cio->cui_iov_olen - iv->iov_len; + } else { + /* restore the iov_len, in case of restart io. */ + iv->iov_len = cio->cui_iov_olen; + } + cio->cui_iov_olen = 0; + } +} + +/** + * Helper function that if necessary adjusts file size (inode->i_size), when + * position at the offset \a pos is accessed. File size can be arbitrary stale + * on a Lustre client, but client at least knows KMS. If accessed area is + * inside [0, KMS], set file size to KMS, otherwise glimpse file size. + * + * Locking: cl_isize_lock is used to serialize changes to inode size and to + * protect consistency between inode size and cl_object + * attributes. cl_object_size_lock() protects consistency between cl_attr's of + * top-object and sub-objects. + */ +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t start, size_t count, int *exceed) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + struct inode *inode = ccc_object_inode(obj); + loff_t pos = start + count - 1; + loff_t kms; + int result; + + /* + * Consistency guarantees: following possibilities exist for the + * relation between region being accessed and real file size at this + * moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under DLM lock already acquired by + * the caller, because to change the class, other client has to take + * DLM lock conflicting with our lock. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates. + */ + ccc_object_size_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + ccc_object_size_unlock(obj); + result = cl_glimpse_lock(env, io, inode, obj, 0); + if (result == 0 && exceed != NULL) { + /* If objective page index exceed end-of-file + * page index, return directly. Do not expect + * kernel will check such case correctly. + * linux-2.6.18-128.1.1 miss to do that. + * --bug 17336 */ + loff_t size = cl_isize_read(inode); + unsigned long cur_index = start >> PAGE_CACHE_SHIFT; + + if ((size == 0 && cur_index != 0) || + (((size - 1) >> PAGE_CACHE_SHIFT) < cur_index)) + *exceed = 1; + } + return result; + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + if (cl_isize_read(inode) < kms) { + cl_isize_write_nolock(inode, kms); + CDEBUG(D_VFSTRACE, + DFID" updating i_size "LPU64"\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)cl_isize_read(inode)); + + } + } + } + ccc_object_size_unlock(obj); + return result; +} + +/***************************************************************************** + * + * Transfer operations. + * + */ + +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct ccc_req *vrq; + + if (ioret > 0) + cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret); + + vrq = cl2ccc_req(slice); + OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for ccc + * layer. ccc is responsible for + * + * - o_[mac]time + * + * - o_mode + * + * - o_parent_seq + * + * - o_[ug]id + * + * - o_parent_oid + * + * - o_parent_ver + * + * - o_ioepoch, + * + * and capability. + */ +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct inode *inode; + struct obdo *oa; + obd_flag valid_flags; + + oa = attr->cra_oa; + inode = ccc_object_inode(obj); + valid_flags = OBD_MD_FLTYPE; + + if ((flags & OBD_MD_FLOSSCAPA) != 0) { + LASSERT(attr->cra_capa == NULL); + attr->cra_capa = cl_capa_lookup(inode, + slice->crs_req->crq_type); + } + + if (slice->crs_req->crq_type == CRT_WRITE) { + if (flags & OBD_MD_FLEPOCH) { + oa->o_valid |= OBD_MD_FLEPOCH; + oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch; + valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLUID | OBD_MD_FLGID; + } + } + obdo_from_inode(oa, inode, valid_flags & flags); + obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid); + memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid, + JOBSTATS_JOBID_SIZE); +} + +const struct cl_req_operations ccc_req_ops = { + .cro_attr_set = ccc_req_attr_set, + .cro_completion = ccc_req_completion +}; + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa) +{ + struct lu_env *env; + struct cl_io *io; + int result; + int refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + + io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime); + io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime); + io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime); + io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size; + io->u.ci_setattr.sa_valid = attr->ia_valid; + io->u.ci_setattr.sa_capa = capa; + +again: + if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) { + struct ccc_io *cio = ccc_env_io(env); + + if (attr->ia_valid & ATTR_FILE) + /* populate the file descriptor for ftruncate to honor + * group lock - see LU-787 */ + cio->cui_fd = cl_iattr2fd(inode, attr); + + result = cl_io_loop(env, io); + } else { + result = io->ci_result; + } + cl_io_fini(env, io); + if (unlikely(io->ci_need_restart)) + goto again; + cl_env_put(env, &refcheck); + RETURN(result); +} + +/***************************************************************************** + * + * Type conversions. + * + */ + +struct lu_device *ccc2lu_dev(struct ccc_device *vdv) +{ + return &vdv->cdv_cl.cd_lu_dev; +} + +struct ccc_device *lu2ccc_dev(const struct lu_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev); +} + +struct ccc_device *cl2ccc_dev(const struct cl_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl); +} + +struct lu_object *ccc2lu(struct ccc_object *vob) +{ + return &vob->cob_cl.co_lu; +} + +struct ccc_object *lu2ccc(const struct lu_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl.co_lu); +} + +struct ccc_object *cl2ccc(const struct cl_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl); +} + +struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct ccc_lock, clk_cl); +} + +struct ccc_io *cl2ccc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct ccc_io *cio; + + cio = container_of(slice, struct ccc_io, cui_cl); + LASSERT(cio == ccc_env_io(env)); + return cio; +} + +struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct ccc_req, crq_cl); +} + +struct page *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2ccc_page(slice)->cpg_page; +} + +/***************************************************************************** + * + * Accessors. + * + */ +int ccc_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + struct cl_inode_info *lli = cl_i2info(inode); + + return (S_ISREG(cl_inode_mode(inode)) || + /* i_mode of unlinked inode is zeroed. */ + cl_inode_mode(inode) == 0) && lli->lli_clob == obj; +} + +struct inode *ccc_object_inode(const struct cl_object *obj) +{ + return cl2ccc(obj)->cob_inode; +} + +/** + * Returns a pointer to cl_page associated with \a vmpage, without acquiring + * additional reference to the resulting page. This is an unsafe version of + * cl_vmpage_page() that can only be used under vmpage lock. + */ +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage) +{ + KLASSERT(PageLocked(vmpage)); + return (struct cl_page *)vmpage->private; +} + +/** + * Initialize or update CLIO structures for regular files when new + * meta-data arrives from the server. + * + * \param inode regular file inode + * \param md new file metadata from MDS + * - allocates cl_object if necessary, + * - updated layout, if object was already here. + */ +int cl_file_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct cl_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_md = md + } + }; + int result = 0; + int refcheck; + + LASSERT(md->body->valid & OBD_MD_FLID); + LASSERT(S_ISREG(cl_inode_mode(inode))); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = cl_i2sbi(inode)->ll_site; + lli = cl_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + /* clob is slave of inode, empty lli_clob means for new inode, + * there is no clob in cache with the given fid, so it is + * unnecessary to perform lookup-alloc-lookup-insert, just + * alloc and insert directly. */ + LASSERT(inode->i_state & I_NEW); + conf.coc_lu.loc_flags = LOC_F_NEW; + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as new inode is + * locked by I_NEW bit. + */ + lli->lli_clob = clob; + lli->lli_has_smd = md->lsm != NULL; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else + result = PTR_ERR(clob); + } else { + result = cl_conf_set(env, lli->lli_clob, &conf); + } + + cl_env_put(env, &refcheck); + + if (result != 0) + CERROR("Failure to initialize cl object "DFID": %d\n", + PFID(fid), result); + return result; +} + +/** + * Wait for others drop their references of the object at first, then we drop + * the last one, which will lead to the object be destroyed immediately. + * Must be called after cl_object_kill() against this object. + * + * The reason we want to do this is: destroying top object will wait for sub + * objects being destroyed first, so we can't let bottom layer (e.g. from ASTs) + * to initiate top object destroying which may deadlock. See bz22520. + */ +static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) +{ + struct lu_object_header *header = obj->co_lu.lo_header; + wait_queue_t waiter; + + if (unlikely(atomic_read(&header->loh_ref) != 1)) { + struct lu_site *site = obj->co_lu.lo_dev->ld_site; + struct lu_site_bkt_data *bkt; + + bkt = lu_site_bkt_from_fid(site, &header->loh_fid); + + init_waitqueue_entry_current(&waiter); + add_wait_queue(&bkt->lsb_marche_funebre, &waiter); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&header->loh_ref) == 1) + break; + waitq_wait(&waiter, TASK_UNINTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&bkt->lsb_marche_funebre, &waiter); + } + + cl_object_put(env, obj); +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int refcheck; + int emergency; + + if (clob != NULL) { + void *cookie; + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&ccc_inode_fini_guard); + LASSERT(ccc_inode_fini_env != NULL); + cl_env_implant(ccc_inode_fini_env, &refcheck); + env = ccc_inode_fini_env; + } + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + cl_object_put_last(env, clob); + lli->lli_clob = NULL; + if (emergency) { + cl_env_unplant(ccc_inode_fini_env, &refcheck); + mutex_unlock(&ccc_inode_fini_guard); + } else + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + } +} + +/** + * return IF_* type for given lu_dirent entry. + * IF_* flag shld be converted to particular OS file type in + * platform llite module. + */ +__u16 ll_dirent_type_get(struct lu_dirent *ent) +{ + __u16 type = 0; + struct luda_type *lt; + int len = 0; + + if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + + len = le16_to_cpu(ent->lde_namelen); + len = (len + align) & ~align; + lt = (void *)ent->lde_name + len; + type = IFTODT(le16_to_cpu(lt->lt_type)); + } + return type; +} + +/** + * build inode number from passed @fid */ +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) +{ + if (BITS_PER_LONG == 32 || api32) + RETURN(fid_flatten32(fid)); + else + RETURN(fid_flatten(fid)); +} + +/** + * build inode generation from passed @fid. If our FID overflows the 32-bit + * inode number then return a non-zero generation to distinguish them. */ +__u32 cl_fid_build_gen(const struct lu_fid *fid) +{ + __u32 gen; + ENTRY; + + if (fid_is_igif(fid)) { + gen = lu_igif_gen(fid); + RETURN(gen); + } + + gen = (fid_flatten(fid) >> 32); + RETURN(gen); +} + +/* lsm is unreliable after hsm implementation as layout can be changed at + * any time. This is only to support old, non-clio-ized interfaces. It will + * cause deadlock if clio operations are called with this extra layout refcount + * because in case the layout changed during the IO, ll_layout_refresh() will + * have to wait for the refcount to become zero to destroy the older layout. + * + * Notice that the lsm returned by this function may not be valid unless called + * inside layout lock - MDS_INODELOCK_LAYOUT. */ +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode) +{ + return lov_lsm_get(cl_i2info(inode)->lli_clob); +} + +void inline ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm) +{ + lov_lsm_put(cl_i2info(inode)->lli_clob, lsm); +} diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_misc.c b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c new file mode 100644 index 000000000000..8ecbef92753d --- /dev/null +++ b/drivers/staging/lustre/lustre/lclient/lcommon_misc.c @@ -0,0 +1,194 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + */ +#include +#include +#include +#include +#include + +#include + + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) +{ + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 }; + __u32 valsize = sizeof(struct lov_desc); + int rc, easize, def_easize, cookiesize; + struct lov_desc desc; + __u16 stripes; + ENTRY; + + rc = obd_get_info(NULL, dt_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC, + &valsize, &desc, NULL); + if (rc) + RETURN(rc); + + stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT); + lsm.lsm_stripe_count = stripes; + easize = obd_size_diskmd(dt_exp, &lsm); + + lsm.lsm_stripe_count = desc.ld_default_stripe_count; + def_easize = obd_size_diskmd(dt_exp, &lsm); + + cookiesize = stripes * sizeof(struct llog_cookie); + + CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n", + easize, cookiesize); + + rc = md_init_ea_size(md_exp, easize, def_easize, cookiesize); + RETURN(rc); +} + +/** + * This function is used as an upcall-callback hooked by liblustre and llite + * clients into obd_notify() listeners chain to handle notifications about + * change of import connect_flags. See llu_fsswop_mount() and + * lustre_common_fill_super(). + */ +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data) +{ + struct lustre_client_ocd *lco; + struct client_obd *cli; + __u64 flags; + int result; + + ENTRY; + if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + cli = &watched->u.cli; + lco = owner; + flags = cli->cl_import->imp_connect_data.ocd_connect_flags; + CDEBUG(D_SUPER, "Changing connect_flags: "LPX64" -> "LPX64"\n", + lco->lco_flags, flags); + mutex_lock(&lco->lco_lock); + lco->lco_flags &= flags; + /* for each osc event update ea size */ + if (lco->lco_dt_exp) + cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); + + mutex_unlock(&lco->lco_lock); + result = 0; + } else { + CERROR("unexpected notification from %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + result = -EINVAL; + } + RETURN(result); +} + +#define GROUPLOCK_SCOPE "grouplock" + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg) +{ + struct lu_env *env; + struct cl_io *io; + struct cl_lock *lock; + struct cl_lock_descr *descr; + __u32 enqflags; + int refcheck; + int rc; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + io = ccc_env_thread_io(env); + io->ci_obj = obj; + io->ci_ignore_layout = 1; + + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc) { + LASSERT(rc < 0); + cl_env_put(env, &refcheck); + return rc; + } + + descr = &ccc_env_info(env)->cti_descr; + descr->cld_obj = obj; + descr->cld_start = 0; + descr->cld_end = CL_PAGE_EOF; + descr->cld_gid = gid; + descr->cld_mode = CLM_GROUP; + + enqflags = CEF_MUST | (nonblock ? CEF_NONBLOCK : 0); + descr->cld_enq_flags = enqflags; + + lock = cl_lock_request(env, io, descr, GROUPLOCK_SCOPE, current); + if (IS_ERR(lock)) { + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + return PTR_ERR(lock); + } + + cg->cg_env = cl_env_get(&refcheck); + cg->cg_io = io; + cg->cg_lock = lock; + cg->cg_gid = gid; + LASSERT(cg->cg_env == env); + + cl_env_unplant(env, &refcheck); + return 0; +} + +void cl_put_grouplock(struct ccc_grouplock *cg) +{ + struct lu_env *env = cg->cg_env; + struct cl_io *io = cg->cg_io; + struct cl_lock *lock = cg->cg_lock; + int refcheck; + + LASSERT(cg->cg_env); + LASSERT(cg->cg_gid); + + cl_env_implant(env, &refcheck); + cl_env_put(env, &refcheck); + + cl_unuse(env, lock); + cl_lock_release(env, lock, GROUPLOCK_SCOPE, current); + cl_io_fini(env, io); + cl_env_put(env, NULL); +} diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c new file mode 100644 index 000000000000..ce90c7e3c488 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/interval_tree.c @@ -0,0 +1,764 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/interval_tree.c + * + * Interval tree library used by ldlm extent lock code + * + * Author: Huang Wei + * Author: Jay Xiong + */ +# include +#include +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +static inline int node_equal(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max,right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. */ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. + */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + ENTRY; + + LASSERT(!interval_is_intree(node)); + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + node->in_intree = 1; + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + if ((o_left = tmp->in_left)) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + if ((o_right = tmp->in_right)) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + ENTRY; + + LASSERT(interval_is_intree(node)); + node->in_intree = 0; + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) + parent->in_right = child; + else + parent->in_left = child; + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child ? : parent, node->in_max_high); + update_maxhigh(node, old->in_max_high); + if (parent == old) + parent = node; + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child ? : parent, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. + * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. */ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c new file mode 100644 index 000000000000..853409aa945d --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/l_lock.c @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +#include + +#include +#include + +/** + * Lock a lock and its resource. + * + * LDLM locking uses resource to serialize access to locks + * but there is a case when we change resource of lock upon + * enqueue reply. We rely on lock->l_resource = new_res + * being an atomic operation. + */ +struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + if (!lock->l_ns_srv) + spin_lock(&lock->l_lock); + + lock_res(lock->l_resource); + + lock->l_res_locked = 1; + return lock->l_resource; +} +EXPORT_SYMBOL(lock_res_and_lock); + +/** + * Unlock a lock and its resource previously locked with lock_res_and_lock + */ +void unlock_res_and_lock(struct ldlm_lock *lock) +{ + /* on server-side resource of lock doesn't change */ + lock->l_res_locked = 0; + + unlock_res(lock->l_resource); + if (!lock->l_ns_srv) + spin_unlock(&lock->l_lock); +} +EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c new file mode 100644 index 000000000000..f7432f78e396 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c @@ -0,0 +1,242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_extent.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of EXTENT lock type + * + * EXTENT lock type is for locking a contiguous range of values, represented + * by 64-bit starting and ending offsets (inclusive). There are several extent + * lock modes, some of which may be mutually incompatible. Extent locks are + * considered incompatible if their modes are incompatible and their extents + * intersect. See the lock mode compatibility matrix in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM +# include + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + + +/* When a lock is cancelled by a client, the KMS may undergo change if this + * is the "highest lock". This function returns the new KMS value. + * Caller must hold lr_lock already. + * + * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) +{ + struct ldlm_resource *res = lock->l_resource; + struct list_head *tmp; + struct ldlm_lock *lck; + __u64 kms = 0; + ENTRY; + + /* don't let another thread in ldlm_extent_shift_kms race in + * just after we finish and take our lock into account in its + * calculation of the kms */ + lock->l_flags |= LDLM_FL_KMS_IGNORE; + + list_for_each(tmp, &res->lr_granted) { + lck = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lck->l_flags & LDLM_FL_KMS_IGNORE) + continue; + + if (lck->l_policy_data.l_extent.end >= old_kms) + RETURN(old_kms); + + /* This extent _has_ to be smaller than old_kms (checked above) + * so kms can only ever be smaller or the same as old_kms. */ + if (lck->l_policy_data.l_extent.end + 1 > kms) + kms = lck->l_policy_data.l_extent.end + 1; + } + LASSERTF(kms <= old_kms, "kms "LPU64" old_kms "LPU64"\n", kms, old_kms); + + RETURN(kms); +} +EXPORT_SYMBOL(ldlm_extent_shift_kms); + +struct kmem_cache *ldlm_interval_slab; +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO); + if (node == NULL) + RETURN(NULL); + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return (list_empty(&n->li_group) ? n : NULL); +} + +static inline int lock_mode_to_index(ldlm_mode_t mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(IS_PO2(mode)); + for (index = -1; mode; index++, mode >>= 1) ; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +/** Add newly granted lock into interval tree for the resource. */ +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx; + + LASSERT(lock->l_granted_mode == lock->l_req_mode); + + node = lock->l_tree_node; + LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + interval_set(&node->li_node, extent->start, extent->end); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. */ + ldlm_resource_add_lock(res, &res->lr_granted, lock); +} + +/** Remove cancelled lock from resource interval tree. */ +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node = lock->l_tree_node; + struct ldlm_interval_tree *tree; + int idx; + + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ + return; + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} + +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_extent.start = wpolicy->l_extent.start; + lpolicy->l_extent.end = wpolicy->l_extent.end; + lpolicy->l_extent.gid = wpolicy->l_extent.gid; +} + +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_extent.start = lpolicy->l_extent.start; + wpolicy->l_extent.end = lpolicy->l_extent.end; + wpolicy->l_extent.gid = lpolicy->l_extent.gid; +} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c new file mode 100644 index 000000000000..f100a84bde73 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c @@ -0,0 +1,849 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003 Hewlett-Packard Development Company LP. + * Developed under the sponsorship of the US Government under + * Subcontract No. B514193 + * + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file implements POSIX lock type for Lustre. + * Its policy properties are start and end of extent and PID. + * + * These locks are only done through MDS due to POSIX semantics requiring + * e.g. that locks could be only partially released and as such split into + * two parts, and also that two adjacent locks from the same process may be + * merged into a single wider lock. + * + * Lock modes are mapped like this: + * PR and PW for READ and WRITE locks + * NL to request a releasing of a portion of the lock + * + * These flock locks never timeout. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); + +/** + * list_for_remaining_safe - iterate over the remaining entries in a list + * and safeguard against removal of a list entry. + * \param pos the &struct list_head to use as a loop counter. pos MUST + * have been initialized prior to using it in this macro. + * \param n another &struct list_head to use as temporary storage + * \param head the head for your list. + */ +#define list_for_remaining_safe(pos, n, head) \ + for (n = pos->next; pos != (head); pos = n, n = pos->next) + +static inline int +ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.owner == + lock->l_policy_data.l_flock.owner) && + (new->l_export == lock->l_export)); +} + +static inline int +ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) +{ + return((new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.end) && + (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.start)); +} + +static inline int ldlm_flock_blocking_link(struct ldlm_lock *req, + struct ldlm_lock *lock) +{ + int rc = 0; + + /* For server only */ + if (req->l_export == NULL) + return 0; + + if (unlikely(req->l_export->exp_flock_hash == NULL)) { + rc = ldlm_init_flock_export(req->l_export); + if (rc) + goto error; + } + + LASSERT(hlist_unhashed(&req->l_exp_flock_hash)); + + req->l_policy_data.l_flock.blocking_owner = + lock->l_policy_data.l_flock.owner; + req->l_policy_data.l_flock.blocking_export = + lock->l_export; + req->l_policy_data.l_flock.blocking_refs = 0; + + cfs_hash_add(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +error: + return rc; +} + +static inline void ldlm_flock_blocking_unlink(struct ldlm_lock *req) +{ + /* For server only */ + if (req->l_export == NULL) + return; + + check_res_locked(req->l_resource); + if (req->l_export->exp_flock_hash != NULL && + !hlist_unhashed(&req->l_exp_flock_hash)) + cfs_hash_del(req->l_export->exp_flock_hash, + &req->l_policy_data.l_flock.owner, + &req->l_exp_flock_hash); +} + +static inline void +ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, __u64 flags) +{ + ENTRY; + + LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%llx)", + mode, flags); + + /* Safe to not lock here, since it should be empty anyway */ + LASSERT(hlist_unhashed(&lock->l_exp_flock_hash)); + + list_del_init(&lock->l_res_link); + if (flags == LDLM_FL_WAIT_NOREPROC && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* client side - set a flag to prevent sending a CANCEL */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; + + /* when reaching here, it is under lock_res_and_lock(). Thus, + need call the nolock version of ldlm_lock_decref_internal*/ + ldlm_lock_decref_internal_nolock(lock, mode); + } + + ldlm_lock_destroy_nolock(lock); + EXIT; +} + +/** + * POSIX locks deadlock detection code. + * + * Given a new lock \a req and an existing lock \a bl_lock it conflicts + * with, we need to iterate through all blocked POSIX locks for this + * export and see if there is a deadlock condition arising. (i.e. when + * one client holds a lock on something and want a lock on something + * else and at the same time another client has the opposite situation). + */ +static int +ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *bl_lock) +{ + struct obd_export *req_exp = req->l_export; + struct obd_export *bl_exp = bl_lock->l_export; + __u64 req_owner = req->l_policy_data.l_flock.owner; + __u64 bl_owner = bl_lock->l_policy_data.l_flock.owner; + + /* For server only */ + if (req_exp == NULL) + return 0; + + class_export_get(bl_exp); + while (1) { + struct obd_export *bl_exp_new; + struct ldlm_lock *lock = NULL; + struct ldlm_flock *flock; + + if (bl_exp->exp_flock_hash != NULL) + lock = cfs_hash_lookup(bl_exp->exp_flock_hash, + &bl_owner); + if (lock == NULL) + break; + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->owner == bl_owner); + bl_owner = flock->blocking_owner; + bl_exp_new = class_export_get(flock->blocking_export); + class_export_put(bl_exp); + + cfs_hash_put(bl_exp->exp_flock_hash, &lock->l_exp_flock_hash); + bl_exp = bl_exp_new; + + if (bl_owner == req_owner && bl_exp == req_exp) { + class_export_put(bl_exp); + return 1; + } + } + class_export_put(bl_exp); + + return 0; +} + +/** + * Process a granting attempt for flock lock. + * Must be called under ns lock held. + * + * This function looks for any conflicts for \a lock in the granted or + * waiting queues. The lock is granted if no conflicts are found in + * either queue. + * + * It is also responsible for splitting a lock if a portion of the lock + * is released. + * + * If \a first_enq is 0 (ie, called from ldlm_reprocess_queue): + * - blocking ASTs have already been sent + * + * If \a first_enq is 1 (ie, called from ldlm_lock_enqueue): + * - blocking ASTs have not been sent yet, so list of conflicting locks + * would be collected and ASTs sent. + */ +int +ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, int first_enq, + ldlm_error_t *err, struct list_head *work_list) +{ + struct ldlm_resource *res = req->l_resource; + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + struct list_head *tmp; + struct list_head *ownlocks = NULL; + struct ldlm_lock *lock = NULL; + struct ldlm_lock *new = req; + struct ldlm_lock *new2 = NULL; + ldlm_mode_t mode = req->l_req_mode; + int local = ns_is_client(ns); + int added = (mode == LCK_NL); + int overlaps = 0; + int splitted = 0; + const struct ldlm_callback_suite null_cbs = { NULL }; + int rc; + ENTRY; + + CDEBUG(D_DLMTRACE, "flags %#llx owner "LPU64" pid %u mode %u start " + LPU64" end "LPU64"\n", *flags, + new->l_policy_data.l_flock.owner, + new->l_policy_data.l_flock.pid, mode, + req->l_policy_data.l_flock.start, + req->l_policy_data.l_flock.end); + + *err = ELDLM_OK; + + if (local) { + /* No blocking ASTs are sent to the clients for + * Posix file & record locks */ + req->l_blocking_ast = NULL; + } else { + /* Called on the server for lock cancels. */ + req->l_blocking_ast = ldlm_flock_blocking_ast; + } + +reprocess: + if ((*flags == LDLM_FL_WAIT_NOREPROC) || (mode == LCK_NL)) { + /* This loop determines where this processes locks start + * in the resource lr_granted list. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (ldlm_same_flock_owner(lock, req)) { + ownlocks = tmp; + break; + } + } + } else { + lockmode_verify(mode); + + /* This loop determines if there are existing locks + * that conflict with the new lock request. */ + list_for_each(tmp, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + + if (ldlm_same_flock_owner(lock, req)) { + if (!ownlocks) + ownlocks = tmp; + continue; + } + + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + if (!ldlm_flocks_overlap(lock, req)) + continue; + + if (!first_enq) + RETURN(LDLM_ITER_CONTINUE); + + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EAGAIN; + RETURN(LDLM_ITER_STOP); + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = lock->l_granted_mode; + req->l_policy_data.l_flock.pid = + lock->l_policy_data.l_flock.pid; + req->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + req->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + if (ldlm_flock_deadlock(req, lock)) { + ldlm_flock_destroy(req, mode, *flags); + *err = -EDEADLK; + RETURN(LDLM_ITER_STOP); + } + + rc = ldlm_flock_blocking_link(req, lock); + if (rc) { + ldlm_flock_destroy(req, mode, *flags); + *err = rc; + RETURN(LDLM_ITER_STOP); + } + ldlm_resource_add_lock(res, &res->lr_waiting, req); + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(LDLM_ITER_STOP); + } + } + + if (*flags & LDLM_FL_TEST_LOCK) { + ldlm_flock_destroy(req, mode, *flags); + req->l_req_mode = LCK_NL; + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(LDLM_ITER_STOP); + } + + /* In case we had slept on this lock request take it off of the + * deadlock detection hash list. */ + ldlm_flock_blocking_unlink(req); + + /* Scan the locks owned by this process that overlap this request. + * We may have to merge or split existing locks. */ + + if (!ownlocks) + ownlocks = &res->lr_granted; + + list_for_remaining_safe(ownlocks, tmp, &res->lr_granted) { + lock = list_entry(ownlocks, struct ldlm_lock, l_res_link); + + if (!ldlm_same_flock_owner(lock, new)) + break; + + if (lock->l_granted_mode == mode) { + /* If the modes are the same then we need to process + * locks that overlap OR adjoin the new lock. The extra + * logic condition is necessary to deal with arithmetic + * overflow and underflow. */ + if ((new->l_policy_data.l_flock.start > + (lock->l_policy_data.l_flock.end + 1)) + && (lock->l_policy_data.l_flock.end != + OBD_OBJECT_EOF)) + continue; + + if ((new->l_policy_data.l_flock.end < + (lock->l_policy_data.l_flock.start - 1)) + && (lock->l_policy_data.l_flock.start != 0)) + break; + + if (new->l_policy_data.l_flock.start < + lock->l_policy_data.l_flock.start) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.start; + } else { + new->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + } + + if (new->l_policy_data.l_flock.end > + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.end; + } else { + new->l_policy_data.l_flock.end = + lock->l_policy_data.l_flock.end; + } + + if (added) { + ldlm_flock_destroy(lock, mode, *flags); + } else { + new = lock; + added = 1; + } + continue; + } + + if (new->l_policy_data.l_flock.start > + lock->l_policy_data.l_flock.end) + continue; + + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.start) + break; + + ++overlaps; + + if (new->l_policy_data.l_flock.start <= + lock->l_policy_data.l_flock.start) { + if (new->l_policy_data.l_flock.end < + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + break; + } + ldlm_flock_destroy(lock, lock->l_req_mode, *flags); + continue; + } + if (new->l_policy_data.l_flock.end >= + lock->l_policy_data.l_flock.end) { + lock->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + continue; + } + + /* split the existing lock into two locks */ + + /* if this is an F_UNLCK operation then we could avoid + * allocating a new lock and use the req lock passed in + * with the request but this would complicate the reply + * processing since updates to req get reflected in the + * reply. The client side replays the lock request so + * it must see the original lock data in the reply. */ + + /* XXX - if ldlm_lock_new() can sleep we should + * release the lr_lock, allocate the new lock, + * and restart processing this lock. */ + if (!new2) { + unlock_res_and_lock(req); + new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, + lock->l_granted_mode, &null_cbs, + NULL, 0, LVB_T_NONE); + lock_res_and_lock(req); + if (!new2) { + ldlm_flock_destroy(req, lock->l_granted_mode, + *flags); + *err = -ENOLCK; + RETURN(LDLM_ITER_STOP); + } + goto reprocess; + } + + splitted = 1; + + new2->l_granted_mode = lock->l_granted_mode; + new2->l_policy_data.l_flock.pid = + new->l_policy_data.l_flock.pid; + new2->l_policy_data.l_flock.owner = + new->l_policy_data.l_flock.owner; + new2->l_policy_data.l_flock.start = + lock->l_policy_data.l_flock.start; + new2->l_policy_data.l_flock.end = + new->l_policy_data.l_flock.start - 1; + lock->l_policy_data.l_flock.start = + new->l_policy_data.l_flock.end + 1; + new2->l_conn_export = lock->l_conn_export; + if (lock->l_export != NULL) { + new2->l_export = class_export_lock_get(lock->l_export, new2); + if (new2->l_export->exp_lock_hash && + hlist_unhashed(&new2->l_exp_hash)) + cfs_hash_add(new2->l_export->exp_lock_hash, + &new2->l_remote_handle, + &new2->l_exp_hash); + } + if (*flags == LDLM_FL_WAIT_NOREPROC) + ldlm_lock_addref_internal_nolock(new2, + lock->l_granted_mode); + + /* insert new2 at lock */ + ldlm_resource_add_lock(res, ownlocks, new2); + LDLM_LOCK_RELEASE(new2); + break; + } + + /* if new2 is created but never used, destroy it*/ + if (splitted == 0 && new2 != NULL) + ldlm_lock_destroy_nolock(new2); + + /* At this point we're granting the lock request. */ + req->l_granted_mode = req->l_req_mode; + + /* Add req to the granted queue before calling ldlm_reprocess_all(). */ + if (!added) { + list_del_init(&req->l_res_link); + /* insert new lock before ownlocks in list. */ + ldlm_resource_add_lock(res, ownlocks, req); + } + + if (*flags != LDLM_FL_WAIT_NOREPROC) { + /* The only one possible case for client-side calls flock + * policy function is ldlm_flock_completion_ast inside which + * carries LDLM_FL_WAIT_NOREPROC flag. */ + CERROR("Illegal parameter for client-side-only module.\n"); + LBUG(); + } + + /* In case we're reprocessing the requested lock we can't destroy + * it until after calling ldlm_add_ast_work_item() above so that laawi() + * can bump the reference count on \a req. Otherwise \a req + * could be freed before the completion AST can be sent. */ + if (added) + ldlm_flock_destroy(req, mode, *flags); + + ldlm_resource_dump(D_INFO, res); + RETURN(LDLM_ITER_CONTINUE); +} + +struct ldlm_flock_wait_data { + struct ldlm_lock *fwd_lock; + int fwd_generation; +}; + +static void +ldlm_flock_interrupted_wait(void *data) +{ + struct ldlm_lock *lock; + ENTRY; + + lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + + /* client side - set flag to prevent lock from being put on LRU list */ + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + + EXIT; +} + +/** + * Flock completion callback function. + * + * \param lock [in,out]: A lock to be handled + * \param flags [in]: flags + * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg + * + * \retval 0 : success + * \retval <0 : failure + */ +int +ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + struct file_lock *getlk = lock->l_ast_data; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct ldlm_flock_wait_data fwd; + struct l_wait_info lwi; + ldlm_error_t err; + int rc = 0; + ENTRY; + + CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n", + flags, data, getlk); + + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == + (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { + if (lock->l_req_mode == lock->l_granted_mode && + lock->l_granted_mode != LCK_NL && + NULL == data) + ldlm_lock_decref_internal(lock, lock->l_req_mode); + + /* Need to wake up the waiter if we were evicted */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + if (NULL == data) + /* mds granted the lock in the reply */ + goto granted; + /* CP AST RPC: lock get granted, wake it up */ + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + fwd.fwd_lock = lock; + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, there is no import */ + if (NULL != obd) + imp = obd->u.cli.cl_import; + + if (NULL != imp) { + spin_lock(&imp->imp_lock); + fwd.fwd_generation = imp->imp_generation; + spin_unlock(&imp->imp_lock); + } + + lwi = LWI_TIMEOUT_INTR(0, NULL, ldlm_flock_interrupted_wait, &fwd); + + /* Go to sleep until the lock is granted. */ + rc = l_wait_event(lock->l_waitq, is_granted_or_cancelled(lock), &lwi); + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + +granted: + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); + + if (lock->l_destroyed) { + LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); + RETURN(0); + } + + if (lock->l_flags & LDLM_FL_FAILED) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed"); + RETURN(-EIO); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + LDLM_DEBUG(lock, "client-side enqueue granted"); + + lock_res_and_lock(lock); + + /* take lock off the deadlock detection hash list. */ + ldlm_flock_blocking_unlink(lock); + + /* ldlm_lock_enqueue() has already placed lock on the granted list. */ + list_del_init(&lock->l_res_link); + + if (flags & LDLM_FL_TEST_LOCK) { + /* fcntl(F_GETLK) request */ + /* The old mode was saved in getlk->fl_type so that if the mode + * in the lock changes we can decref the appropriate refcount.*/ + ldlm_flock_destroy(lock, flock_type(getlk), + LDLM_FL_WAIT_NOREPROC); + switch (lock->l_granted_mode) { + case LCK_PR: + flock_set_type(getlk, F_RDLCK); + break; + case LCK_PW: + flock_set_type(getlk, F_WRLCK); + break; + default: + flock_set_type(getlk, F_UNLCK); + } + flock_set_pid(getlk, (pid_t)lock->l_policy_data.l_flock.pid); + flock_set_start(getlk, + (loff_t)lock->l_policy_data.l_flock.start); + flock_set_end(getlk, + (loff_t)lock->l_policy_data.l_flock.end); + } else { + __u64 noreproc = LDLM_FL_WAIT_NOREPROC; + + /* We need to reprocess the lock to do merges or splits + * with existing locks owned by this process. */ + ldlm_process_flock_lock(lock, &noreproc, 1, &err, NULL); + } + unlock_res_and_lock(lock); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_flock_completion_ast); + +int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + LASSERT(lock); + LASSERT(flag == LDLM_CB_CANCELING); + + /* take lock off the deadlock detection hash list. */ + lock_res_and_lock(lock); + ldlm_flock_blocking_unlink(lock); + unlock_res_and_lock(lock); + RETURN(0); +} + +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + /* Compat code, old clients had no idea about owner field and + * relied solely on pid for ownership. Introduced in LU-104, 2.1, + * April 2011 */ + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_pid; +} + + +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; + lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; + lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; + lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; +} + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; + wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; + wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; + wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; +} + +/* + * Export handle<->flock hash operations. + */ +static unsigned +ldlm_export_flock_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(*(__u64 *)key, mask); +} + +static void * +ldlm_export_flock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + return &lock->l_policy_data.l_flock.owner; +} + +static int +ldlm_export_flock_keycmp(const void *key, struct hlist_node *hnode) +{ + return !memcmp(ldlm_export_flock_key(hnode), key, sizeof(__u64)); +} + +static void * +ldlm_export_flock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); +} + +static void +ldlm_export_flock_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_GET(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_get(flock->blocking_export); + flock->blocking_refs++; +} + +static void +ldlm_export_flock_put(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + struct ldlm_flock *flock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_flock_hash); + LDLM_LOCK_RELEASE(lock); + + flock = &lock->l_policy_data.l_flock; + LASSERT(flock->blocking_export != NULL); + class_export_put(flock->blocking_export); + if (--flock->blocking_refs == 0) { + flock->blocking_owner = 0; + flock->blocking_export = NULL; + } +} + +static cfs_hash_ops_t ldlm_export_flock_ops = { + .hs_hash = ldlm_export_flock_hash, + .hs_key = ldlm_export_flock_key, + .hs_keycmp = ldlm_export_flock_keycmp, + .hs_object = ldlm_export_flock_object, + .hs_get = ldlm_export_flock_get, + .hs_put = ldlm_export_flock_put, + .hs_put_locked = ldlm_export_flock_put, +}; + +int ldlm_init_flock_export(struct obd_export *exp) +{ + exp->exp_flock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_flock_ops, + CFS_HASH_DEFAULT | CFS_HASH_NBLK_CHANGE); + if (!exp->exp_flock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(ldlm_init_flock_export); + +void ldlm_destroy_flock_export(struct obd_export *exp) +{ + ENTRY; + if (exp->exp_flock_hash) { + cfs_hash_putref(exp->exp_flock_hash); + exp->exp_flock_hash = NULL; + } + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_flock_export); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c new file mode 100644 index 000000000000..574b2ff43b74 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_inodebits.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of IBITS lock type + * + * IBITS lock type contains a bit mask determining various properties of an + * object. The meanings of specific bits are specific to the caller and are + * opaque to LDLM code. + * + * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) + * are considered conflicting. See the lock mode compatibility matrix + * in lustre_dlm.h. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + + +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + memset(lpolicy, 0, sizeof(*lpolicy)); + lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; +} + +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + memset(wpolicy, 0, sizeof(*wpolicy)); + wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; +} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h new file mode 100644 index 000000000000..a08e6d9757be --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h @@ -0,0 +1,277 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define MAX_STRING_SIZE 128 + +extern atomic_t ldlm_srv_namespace_nr; +extern atomic_t ldlm_cli_namespace_nr; +extern struct mutex ldlm_srv_namespace_lock; +extern struct list_head ldlm_srv_namespace_list; +extern struct mutex ldlm_cli_namespace_lock; +extern struct list_head ldlm_cli_namespace_list; + +static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_nr : &ldlm_cli_namespace_nr; +} + +static inline struct list_head *ldlm_namespace_list(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_list : &ldlm_cli_namespace_list; +} + +static inline struct mutex *ldlm_namespace_lock(ldlm_side_t client) +{ + return client == LDLM_NAMESPACE_SERVER ? + &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; +} + +/* ldlm_request.c */ +/* Cancel lru flag, it indicates we cancel aged locks. */ +enum { + LDLM_CANCEL_AGED = 1 << 0, /* Cancel aged locks (non lru resize). */ + LDLM_CANCEL_PASSED = 1 << 1, /* Cancel passed number of locks. */ + LDLM_CANCEL_SHRINK = 1 << 2, /* Cancel locks from shrinker. */ + LDLM_CANCEL_LRUR = 1 << 3, /* Cancel locks from lru resize. */ + LDLM_CANCEL_NO_WAIT = 1 << 4 /* Cancel locks w/o blocking (neither + * sending nor waiting for any rpcs) */ +}; + +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t sync, int flags); +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, + struct list_head *cancels, int count, int max, + ldlm_cancel_flags_t cancel_flags, int flags); +extern int ldlm_enqueue_min; +int ldlm_get_enq_timeout(struct ldlm_lock *lock); + +/* ldlm_resource.c */ +int ldlm_resource_putref_locked(struct ldlm_resource *res); +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); +/* ldlm_lock.c */ + +struct ldlm_cb_set_arg { + struct ptlrpc_request_set *set; + int type; /* LDLM_{CP,BL,GL}_CALLBACK */ + atomic_t restart; + struct list_head *list; + union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ +}; + +typedef enum { + LDLM_WORK_BL_AST, + LDLM_WORK_CP_AST, + LDLM_WORK_REVOKE_AST, + LDLM_WORK_GL_AST +} ldlm_desc_ast_t; + +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size); +struct ldlm_lock * +ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *, + ldlm_type_t type, ldlm_mode_t, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, enum lvb_type lvb_type); +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, + void *cookie, __u64 *flags); +void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal(struct ldlm_lock *, __u32 mode); +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *, __u32 mode); +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list); +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type); +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq); +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock); +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock); +void ldlm_lock_add_to_lru(struct ldlm_lock *lock); +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock); +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); + +void ldlm_cancel_locks_for_export(struct obd_export *export); + +/* ldlm_lockd.c */ +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock); +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags); + +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock); + + +/* ldlm_extent.c */ +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); + +/* ldlm_flock.c */ +int ldlm_process_flock_lock(struct ldlm_lock *req, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); +int ldlm_init_flock_export(struct obd_export *exp); +void ldlm_destroy_flock_export(struct obd_export *exp); + +/* l_lock.c */ +void l_check_ns_lock(struct ldlm_namespace *ns); +void l_check_no_ns_lock(struct ldlm_namespace *ns); + +extern proc_dir_entry_t *ldlm_svc_proc_dir; +extern proc_dir_entry_t *ldlm_type_proc_dir; + +struct ldlm_state { + struct ptlrpc_service *ldlm_cb_service; + struct ptlrpc_service *ldlm_cancel_service; + struct ptlrpc_client *ldlm_client; + struct ptlrpc_connection *ldlm_server_conn; + struct ldlm_bl_pool *ldlm_bl_pool; +}; + +/* interval tree, for LDLM_EXTENT. */ +extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, + l_sl_policy); + return &lock->l_policy_data.l_extent; +} + +int ldlm_init(void); +void ldlm_exit(void); + +enum ldlm_policy_res { + LDLM_POLICY_CANCEL_LOCK, + LDLM_POLICY_KEEP_LOCK, + LDLM_POLICY_SKIP_LOCK +}; + +typedef enum ldlm_policy_res ldlm_policy_res_t; + +#define LDLM_POOL_PROC_READER(var, type) \ + static int lprocfs_rd_##var(char *page, char **start, off_t off, \ + int count, int *eof, void *data) \ + { \ + struct ldlm_pool *pl = data; \ + type tmp; \ + \ + spin_lock(&pl->pl_lock); \ + tmp = pl->pl_##var; \ + spin_unlock(&pl->pl_lock); \ + \ + return lprocfs_rd_uint(page, start, off, count, eof, &tmp); \ + } \ + struct __##var##__dummy_read {;} /* semicolon catcher */ + +#define LDLM_POOL_PROC_WRITER(var, type) \ + int lprocfs_wr_##var(struct file *file, const char *buffer, \ + unsigned long count, void *data) \ + { \ + struct ldlm_pool *pl = data; \ + type tmp; \ + int rc; \ + \ + rc = lprocfs_wr_uint(file, buffer, count, &tmp); \ + if (rc < 0) { \ + CERROR("Can't parse user input, rc = %d\n", rc); \ + return rc; \ + } \ + \ + spin_lock(&pl->pl_lock); \ + pl->pl_##var = tmp; \ + spin_unlock(&pl->pl_lock); \ + \ + return rc; \ + } \ + struct __##var##__dummy_write {;} /* semicolon catcher */ + +static inline int is_granted_or_cancelled(struct ldlm_lock *lock) +{ + int ret = 0; + + lock_res_and_lock(lock); + if (((lock->l_req_mode == lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_CP_REQD)) || + (lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_CANCEL))) + ret = 1; + unlock_res_and_lock(lock); + + return ret; +} + +typedef void (*ldlm_policy_wire_to_local_t)(const ldlm_wire_policy_data_t *, + ldlm_policy_data_t *); + +typedef void (*ldlm_policy_local_to_wire_t)(const ldlm_policy_data_t *, + ldlm_wire_policy_data_t *); + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_ibits_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_ibits_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_extent_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_extent_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_flock_policy_wire18_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); +void ldlm_flock_policy_wire21_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +void ldlm_flock_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c new file mode 100644 index 000000000000..42df53072dc3 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,868 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** + * This file deals with various client/target related logic including recovery. + * + * TODO: This code more logically belongs in the ptlrpc module than in ldlm and + * should be moved. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +# include +#include +#include +#include +#include +#include +#include "ldlm_internal.h" + +/* @priority: If non-zero, move the selected connection to the list head. + * @create: If zero, only search in existing connections. + */ +static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority, int create) +{ + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + int rc = 0; + ENTRY; + + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } + + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN (-ENOENT); + } + + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) { + GOTO(out_put, rc = -ENOMEM); + } + } + + spin_lock(&imp->imp_lock); + list_for_each_entry(item, &imp->imp_conn_list, oic_item) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { + list_del(&item->oic_item); + list_add(&item->oic_item, + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = 0); + } + } + /* No existing import connection found for \a uuid. */ + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) + list_add(&imp_conn->oic_item, &imp->imp_conn_list); + else + list_add_tail(&imp_conn->oic_item, + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? "head" : "tail")); + } else { + spin_unlock(&imp->imp_lock); + GOTO(out_free, rc = -ENOENT); + } + + spin_unlock(&imp->imp_lock); + RETURN(0); +out_free: + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); +out_put: + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); +} + +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) +{ + return import_set_conn(imp, uuid, 1, 0); +} + +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + return import_set_conn(imp, uuid, priority, 1); +} +EXPORT_SYMBOL(client_import_add_conn); + +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_import_conn *imp_conn; + struct obd_export *dlmexp; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + if (list_empty(&imp->imp_conn_list)) { + LASSERT(!imp->imp_connection); + GOTO(out, rc); + } + + list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { + if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) + continue; + LASSERT(imp_conn->oic_conn); + + if (imp_conn == imp->imp_conn_current) { + LASSERT(imp_conn->oic_conn == imp->imp_connection); + + if (imp->imp_state != LUSTRE_IMP_CLOSED && + imp->imp_state != LUSTRE_IMP_DISCON) { + CERROR("can't remove current connection\n"); + GOTO(out, rc = -EBUSY); + } + + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = NULL; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + } + + list_del(&imp_conn->oic_item); + ptlrpc_connection_put(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid); + rc = 0; + break; + } +out: + spin_unlock(&imp->imp_lock); + if (rc == -ENOENT) + CERROR("connection %s not found\n", uuid->uuid); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_del_conn); + +/** + * Find conn UUID by peer NID. \a peer is a server NID. This function is used + * to find a conn uuid of \a imp which can reach \a peer. + */ +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid) +{ + struct obd_import_conn *conn; + int rc = -ENOENT; + ENTRY; + + spin_lock(&imp->imp_lock); + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + /* Check if conn UUID does have this peer NID. */ + if (class_check_uuid(&conn->oic_uuid, peer)) { + *uuid = conn->oic_uuid; + rc = 0; + break; + } + } + spin_unlock(&imp->imp_lock); + RETURN(rc); +} +EXPORT_SYMBOL(client_import_find_conn); + +void client_destroy_import(struct obd_import *imp) +{ + /* Drop security policy instance after all RPCs have finished/aborted + * to let all busy contexts be released. */ + class_import_get(imp); + class_destroy_import(imp); + sptlrpc_import_sec_put(imp); + class_import_put(imp); +} +EXPORT_SYMBOL(client_destroy_import); + +/** + * Check whether or not the OSC is on MDT. + * In the config log, + * osc on MDT + * setup 0:{fsname}-OSTxxxx-osc[-MDTxxxx] 1:lustre-OST0000_UUID 2:NID + * osc on client + * setup 0:{fsname}-OSTxxxx-osc 1:lustre-OST0000_UUID 2:NID + * + **/ +static int osc_on_mdt(char *obdname) +{ + char *ptr; + + ptr = strrchr(obdname, '-'); + if (ptr == NULL) + return 0; + + if (strncmp(ptr + 1, "MDT", 3) == 0) + return 1; + + return 0; +} + +/* Configure an RPC client OBD device. + * + * lcfg parameters: + * 1 - client UUID + * 2 - server UUID + * 3 - inactive-on-startup + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct client_obd *cli = &obddev->u.cli; + struct obd_import *imp; + struct obd_uuid server_uuid; + int rq_portal, rp_portal, connect_op; + char *name = obddev->obd_type->typ_name; + ldlm_ns_type_t ns_type = LDLM_NS_TYPE_UNKNOWN; + int rc; + char *cli_name = lustre_cfg_buf(lcfg, 0); + ENTRY; + + /* In a more perfect world, we would hang a ptlrpc_client off of + * obd_type and just use the values from there. */ + if (!strcmp(name, LUSTRE_OSC_NAME) || + (!(strcmp(name, LUSTRE_OSP_NAME)) && + (is_osp_on_mdt(cli_name) && + strstr(lustre_cfg_buf(lcfg, 1), "OST") != NULL))) { + /* OSC or OSP_on_MDT for OSTs */ + rq_portal = OST_REQUEST_PORTAL; + rp_portal = OSC_REPLY_PORTAL; + connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; + ns_type = LDLM_NS_TYPE_OSC; + } else if (!strcmp(name, LUSTRE_MDC_NAME) || + !strcmp(name, LUSTRE_LWP_NAME) || + (!strcmp(name, LUSTRE_OSP_NAME) && + (is_osp_on_mdt(cli_name) && + strstr(lustre_cfg_buf(lcfg, 1), "OST") == NULL))) { + /* MDC or OSP_on_MDT for other MDTs */ + rq_portal = MDS_REQUEST_PORTAL; + rp_portal = MDC_REPLY_PORTAL; + connect_op = MDS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; + ns_type = LDLM_NS_TYPE_MDC; + } else if (!strcmp(name, LUSTRE_MGC_NAME)) { + rq_portal = MGS_REQUEST_PORTAL; + rp_portal = MGC_REPLY_PORTAL; + connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; + ns_type = LDLM_NS_TYPE_MGC; + } else { + CERROR("unknown client OBD type \"%s\", can't setup\n", + name); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { + CERROR("client UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { + CERROR("setup requires a SERVER UUID\n"); + RETURN(-EINVAL); + } + + if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { + CERROR("target UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + init_rwsem(&cli->cl_sem); + sema_init(&cli->cl_mgc_sem, 1); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); + + cli->cl_dirty = 0; + cli->cl_avail_grant = 0; + /* FIXME: Should limit this for the sum of all cl_dirty_max. */ + cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024; + if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > num_physpages / 8) + cli->cl_dirty_max = num_physpages << (PAGE_CACHE_SHIFT - 3); + INIT_LIST_HEAD(&cli->cl_cache_waiters); + INIT_LIST_HEAD(&cli->cl_loi_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); + INIT_LIST_HEAD(&cli->cl_loi_write_list); + INIT_LIST_HEAD(&cli->cl_loi_read_list); + client_obd_list_lock_init(&cli->cl_loi_list_lock); + atomic_set(&cli->cl_pending_w_pages, 0); + atomic_set(&cli->cl_pending_r_pages, 0); + cli->cl_r_in_flight = 0; + cli->cl_w_in_flight = 0; + + spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); + spin_lock_init(&cli->cl_read_page_hist.oh_lock); + spin_lock_init(&cli->cl_write_page_hist.oh_lock); + spin_lock_init(&cli->cl_read_offset_hist.oh_lock); + spin_lock_init(&cli->cl_write_offset_hist.oh_lock); + + /* lru for osc. */ + INIT_LIST_HEAD(&cli->cl_lru_osc); + atomic_set(&cli->cl_lru_shrinkers, 0); + atomic_set(&cli->cl_lru_busy, 0); + atomic_set(&cli->cl_lru_in_list, 0); + INIT_LIST_HEAD(&cli->cl_lru_list); + client_obd_list_lock_init(&cli->cl_lru_list_lock); + + init_waitqueue_head(&cli->cl_destroy_waitq); + atomic_set(&cli->cl_destroy_in_flight, 0); + /* Turn on checksumming by default. */ + cli->cl_checksum = 1; + /* + * The supported checksum types will be worked out at connect time + * Set cl_chksum* to CRC32 for now to avoid returning screwed info + * through procfs. + */ + cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); + + /* This value may be reduced at connect time in + * ptlrpc_connect_interpret() . We initialize it to only + * 1MB until we know what the performance looks like. + * In the future this should likely be increased. LU-1431 */ + cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES, + LNET_MTU >> PAGE_CACHE_SHIFT); + + if (!strcmp(name, LUSTRE_MDC_NAME)) { + cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT; + } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) { + cli->cl_max_rpcs_in_flight = 2; + } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) { + cli->cl_max_rpcs_in_flight = 3; + } else if (num_physpages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) { + cli->cl_max_rpcs_in_flight = 4; + } else { + if (osc_on_mdt(obddev->obd_name)) + cli->cl_max_rpcs_in_flight = MDS_OSC_MAX_RIF_DEFAULT; + else + cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; + } + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + GOTO(err, rc); + } + + ptlrpc_init_client(rq_portal, rp_portal, name, + &obddev->obd_ldlm_client); + + imp = class_new_import(obddev); + if (imp == NULL) + GOTO(err_ldlm, rc = -ENOENT); + imp->imp_client = &obddev->obd_ldlm_client; + imp->imp_connect_op = connect_op; + memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 1)); + class_import_put(imp); + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + } + + cli->cl_import = imp; + /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */ + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); + cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie); + + if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { + if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { + CDEBUG(D_HA, "marking %s %s->%s as inactive\n", + name, obddev->obd_name, + cli->cl_target_uuid.uuid); + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + } + } + + obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, + LDLM_NAMESPACE_CLIENT, + LDLM_NAMESPACE_GREEDY, + ns_type); + if (obddev->obd_namespace == NULL) { + CERROR("Unable to create client namespace - %s\n", + obddev->obd_name); + GOTO(err_import, rc = -ENOMEM); + } + + cli->cl_qchk_stat = CL_NOT_QUOTACHECKED; + + RETURN(rc); + +err_import: + class_destroy_import(imp); +err_ldlm: + ldlm_put_ref(); +err: + RETURN(rc); + +} +EXPORT_SYMBOL(client_obd_setup); + +int client_obd_cleanup(struct obd_device *obddev) +{ + ENTRY; + + ldlm_namespace_free_post(obddev->obd_namespace); + obddev->obd_namespace = NULL; + + LASSERT(obddev->u.cli.cl_import == NULL); + + ldlm_put_ref(); + RETURN(0); +} +EXPORT_SYMBOL(client_obd_cleanup); + +/* ->o_connect() method for client side (OSC and MDC and MGC) */ +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_connect_data *ocd; + struct lustre_handle conn = { 0 }; + int rc; + ENTRY; + + *exp = NULL; + down_write(&cli->cl_sem); + if (cli->cl_conn_count > 0 ) + GOTO(out_sem, rc = -EALREADY); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + *exp = class_conn2export(&conn); + + LASSERT(obd->obd_namespace); + + imp->imp_dlm_handle = conn; + rc = ptlrpc_init_import(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + + ocd = &imp->imp_connect_data; + if (data) { + *ocd = *data; + imp->imp_connect_flags_orig = data->ocd_connect_flags; + } + + rc = ptlrpc_connect_import(imp); + if (rc != 0) { + LASSERT (imp->imp_state == LUSTRE_IMP_DISCON); + GOTO(out_ldlm, rc); + } + LASSERT((*exp)->exp_connection); + + if (data) { + LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == + ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n", + data->ocd_connect_flags, ocd->ocd_connect_flags); + data->ocd_connect_flags = ocd->ocd_connect_flags; + } + + ptlrpc_pinger_add_import(imp); + + EXIT; + + if (rc) { +out_ldlm: + cli->cl_conn_count--; + class_disconnect(*exp); + *exp = NULL; + } +out_sem: + up_write(&cli->cl_sem); + + return rc; +} +EXPORT_SYMBOL(client_connect_import); + +int client_disconnect_export(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct client_obd *cli; + struct obd_import *imp; + int rc = 0, err; + ENTRY; + + if (!obd) { + CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n", + exp, exp ? exp->exp_handle.h_cookie : -1); + RETURN(-EINVAL); + } + + cli = &obd->u.cli; + imp = cli->cl_import; + + down_write(&cli->cl_sem); + CDEBUG(D_INFO, "disconnect %s - %d\n", obd->obd_name, + cli->cl_conn_count); + + if (!cli->cl_conn_count) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_disconnect, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count) + GOTO(out_disconnect, rc = 0); + + /* Mark import deactivated now, so we don't try to reconnect if any + * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't + * fully deactivate the import, or that would drop all requests. */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + /* Some non-replayable imports (MDS's OSCs) are pinged, so just + * delete it regardless. (It's safe to delete an import that was + * never added.) */ + (void)ptlrpc_pinger_del_import(imp); + + if (obd->obd_namespace != NULL) { + /* obd_force == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_force ? LCF_LOCAL : 0, NULL); + ldlm_namespace_free_prior(obd->obd_namespace, imp, obd->obd_force); + } + + /* There's no need to hold sem while disconnecting an import, + * and it may actually cause deadlock in GSS. */ + up_write(&cli->cl_sem); + rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); + + ptlrpc_invalidate_import(imp); + + EXIT; + +out_disconnect: + /* Use server style - class_disconnect should be always called for + * o_disconnect. */ + err = class_disconnect(exp); + if (!rc && err) + rc = err; + + up_write(&cli->cl_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(client_disconnect_export); + + +/** + * Packs current SLV and Limit into \a req. + */ +int target_pack_pool_reply(struct ptlrpc_request *req) +{ + struct obd_device *obd; + ENTRY; + + /* Check that we still have all structures alive as this may + * be some late RPC at shutdown time. */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { + lustre_msg_set_slv(req->rq_repmsg, 0); + lustre_msg_set_limit(req->rq_repmsg, 0); + RETURN(0); + } + + /* OBD is alive here as export is alive, which we checked above. */ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); + + RETURN(0); +} +EXPORT_SYMBOL(target_pack_pool_reply); + +int target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) +{ + if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { + DEBUG_REQ(D_ERROR, req, "dropping reply"); + return (-ECOMM); + } + + if (unlikely(rc)) { + DEBUG_REQ(D_NET, req, "processing error (%d)", rc); + req->rq_status = rc; + return (ptlrpc_send_error(req, 1)); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + } + + return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT)); +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + struct ptlrpc_service_part *svcpt; + int netrc; + struct ptlrpc_reply_state *rs; + struct obd_export *exp; + ENTRY; + + if (req->rq_no_reply) { + EXIT; + return; + } + + svcpt = req->rq_rqbd->rqbd_svcpt; + rs = req->rq_reply_state; + if (rs == NULL || !rs->rs_difficult) { + /* no notifiers */ + target_send_reply_msg (req, rc, fail_id); + EXIT; + return; + } + + /* must be an export if locks saved */ + LASSERT (req->rq_export != NULL); + /* req/reply consistent */ + LASSERT(rs->rs_svcpt == svcpt); + + /* "fresh" reply */ + LASSERT (!rs->rs_scheduled); + LASSERT (!rs->rs_scheduled_ever); + LASSERT (!rs->rs_handled); + LASSERT (!rs->rs_on_net); + LASSERT (rs->rs_export == NULL); + LASSERT (list_empty(&rs->rs_obd_list)); + LASSERT (list_empty(&rs->rs_exp_list)); + + exp = class_export_get (req->rq_export); + + /* disable reply scheduling while I'm setting up */ + rs->rs_scheduled = 1; + rs->rs_on_net = 1; + rs->rs_xid = req->rq_xid; + rs->rs_transno = req->rq_transno; + rs->rs_export = exp; + rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); + + spin_lock(&exp->exp_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n", + rs->rs_transno, exp->exp_last_committed); + if (rs->rs_transno > exp->exp_last_committed) { + /* not committed already */ + list_add_tail(&rs->rs_obd_list, + &exp->exp_uncommitted_replies); + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + + spin_lock(&exp->exp_lock); + list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); + spin_unlock(&exp->exp_lock); + + netrc = target_send_reply_msg(req, rc, fail_id); + + spin_lock(&svcpt->scp_rep_lock); + + atomic_inc(&svcpt->scp_nreps_difficult); + + if (netrc != 0) { + /* error sending: reply is off the net. Also we need +1 + * reply ref until ptlrpc_handle_rs() is done + * with the reply state (if the send was successful, there + * would have been +1 ref for the net, which + * reply_out_callback leaves alone) */ + rs->rs_on_net = 0; + ptlrpc_rs_addref(rs); + } + + spin_lock(&rs->rs_lock); + if (rs->rs_transno <= exp->exp_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); + ptlrpc_dispatch_difficult_reply(rs); + } else { + list_add(&rs->rs_list, &svcpt->scp_rep_active); + rs->rs_scheduled = 0; /* allow notifier to schedule */ + } + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + EXIT; +} +EXPORT_SYMBOL(target_send_reply); + +ldlm_mode_t lck_compat_array[] = { + [LCK_EX] LCK_COMPAT_EX, + [LCK_PW] LCK_COMPAT_PW, + [LCK_PR] LCK_COMPAT_PR, + [LCK_CW] LCK_COMPAT_CW, + [LCK_CR] LCK_COMPAT_CR, + [LCK_NL] LCK_COMPAT_NL, + [LCK_GROUP] LCK_COMPAT_GROUP, + [LCK_COS] LCK_COMPAT_COS, +}; + +/** + * Rather arbitrary mapping from LDLM error codes to errno values. This should + * not escape to the user level. + */ +int ldlm_error2errno(ldlm_error_t error) +{ + int result; + + switch (error) { + case ELDLM_OK: + result = 0; + break; + case ELDLM_LOCK_CHANGED: + result = -ESTALE; + break; + case ELDLM_LOCK_ABORTED: + result = -ENAVAIL; + break; + case ELDLM_LOCK_REPLACED: + result = -ESRCH; + break; + case ELDLM_NO_LOCK_DATA: + result = -ENOENT; + break; + case ELDLM_NAMESPACE_EXISTS: + result = -EEXIST; + break; + case ELDLM_BAD_NAMESPACE: + result = -EBADF; + break; + default: + if (((int)error) < 0) /* cast to signed type */ + result = error; /* as ldlm_error_t can be unsigned */ + else { + CERROR("Invalid DLM result code: %d\n", error); + result = -EPROTO; + } + } + return result; +} +EXPORT_SYMBOL(ldlm_error2errno); + +/** + * Dual to ldlm_error2errno(): maps errno values back to ldlm_error_t. + */ +ldlm_error_t ldlm_errno2error(int err_no) +{ + int error; + + switch (err_no) { + case 0: + error = ELDLM_OK; + break; + case -ESTALE: + error = ELDLM_LOCK_CHANGED; + break; + case -ENAVAIL: + error = ELDLM_LOCK_ABORTED; + break; + case -ESRCH: + error = ELDLM_LOCK_REPLACED; + break; + case -ENOENT: + error = ELDLM_NO_LOCK_DATA; + break; + case -EEXIST: + error = ELDLM_NAMESPACE_EXISTS; + break; + case -EBADF: + error = ELDLM_BAD_NAMESPACE; + break; + default: + error = err_no; + } + return error; +} +EXPORT_SYMBOL(ldlm_errno2error); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp) +{ + spin_lock(&exp->exp_locks_list_guard); + if (!list_empty(&exp->exp_locks_list)) { + struct ldlm_lock *lock; + + CERROR("dumping locks for export %p," + "ignore if the unmount doesn't hang\n", exp); + list_for_each_entry(lock, &exp->exp_locks_list, + l_exp_refs_link) + LDLM_ERROR(lock, "lock:"); + } + spin_unlock(&exp->exp_locks_list_guard); +} +#endif diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c new file mode 100644 index 000000000000..bd39e1c78527 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c @@ -0,0 +1,2443 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lock.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +# include +# include + +#include +#include "ldlm_internal.h" + +/* lock types */ +char *ldlm_lockname[] = { + [0] "--", + [LCK_EX] "EX", + [LCK_PW] "PW", + [LCK_PR] "PR", + [LCK_CW] "CW", + [LCK_CR] "CR", + [LCK_NL] "NL", + [LCK_GROUP] "GROUP", + [LCK_COS] "COS" +}; +EXPORT_SYMBOL(ldlm_lockname); + +char *ldlm_typename[] = { + [LDLM_PLAIN] "PLN", + [LDLM_EXTENT] "EXT", + [LDLM_FLOCK] "FLK", + [LDLM_IBITS] "IBT", +}; +EXPORT_SYMBOL(ldlm_typename); + +static ldlm_policy_wire_to_local_t ldlm_policy_wire18_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_wire18_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_wire_to_local_t ldlm_policy_wire21_to_local[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_wire_to_local, + [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_wire_to_local, + [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_wire21_to_local, + [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_wire_to_local, +}; + +static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { + [LDLM_PLAIN - LDLM_MIN_TYPE] ldlm_plain_policy_local_to_wire, + [LDLM_EXTENT - LDLM_MIN_TYPE] ldlm_extent_policy_local_to_wire, + [LDLM_FLOCK - LDLM_MIN_TYPE] ldlm_flock_policy_local_to_wire, + [LDLM_IBITS - LDLM_MIN_TYPE] ldlm_ibits_policy_local_to_wire, +}; + +/** + * Converts lock policy from local format to on the wire lock_desc format + */ +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + ldlm_policy_local_to_wire_t convert; + + convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; + + convert(lpolicy, wpolicy); +} + +/** + * Converts lock policy from on the wire lock_desc format to local format + */ +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + ldlm_policy_wire_to_local_t convert; + int new_client; + + /** some badness for 2.0.0 clients, but 2.0.0 isn't supported */ + new_client = (exp_connect_flags(exp) & OBD_CONNECT_FULL20) != 0; + if (new_client) + convert = ldlm_policy_wire21_to_local[type - LDLM_MIN_TYPE]; + else + convert = ldlm_policy_wire18_to_local[type - LDLM_MIN_TYPE]; + + convert(wpolicy, lpolicy); +} + +char *ldlm_it2str(int it) +{ + switch (it) { + case IT_OPEN: + return "open"; + case IT_CREAT: + return "creat"; + case (IT_OPEN | IT_CREAT): + return "open|creat"; + case IT_READDIR: + return "readdir"; + case IT_GETATTR: + return "getattr"; + case IT_LOOKUP: + return "lookup"; + case IT_UNLINK: + return "unlink"; + case IT_GETXATTR: + return "getxattr"; + case IT_LAYOUT: + return "layout"; + default: + CERROR("Unknown intent %d\n", it); + return "UNKNOWN"; + } +} +EXPORT_SYMBOL(ldlm_it2str); + +extern struct kmem_cache *ldlm_lock_slab; + + +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg) +{ + ns->ns_policy = arg; +} +EXPORT_SYMBOL(ldlm_register_intent); + +/* + * REFCOUNTED LOCK OBJECTS + */ + + +/** + * Get a reference on a lock. + * + * Lock refcounts, during creation: + * - one special one for allocation, dec'd only once in destroy + * - one for being a lock that's in-use + * - one for the addref associated with a new lock + */ +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) +{ + atomic_inc(&lock->l_refc); + return lock; +} +EXPORT_SYMBOL(ldlm_lock_get); + +/** + * Release lock reference. + * + * Also frees the lock if it was last reference. + */ +void ldlm_lock_put(struct ldlm_lock *lock) +{ + ENTRY; + + LASSERT(lock->l_resource != LP_POISON); + LASSERT(atomic_read(&lock->l_refc) > 0); + if (atomic_dec_and_test(&lock->l_refc)) { + struct ldlm_resource *res; + + LDLM_DEBUG(lock, + "final lock_put on destroyed lock, freeing it."); + + res = lock->l_resource; + LASSERT(lock->l_destroyed); + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_pending_chain)); + + lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, + LDLM_NSS_LOCKS); + lu_ref_del(&res->lr_reference, "lock", lock); + ldlm_resource_putref(res); + lock->l_resource = NULL; + if (lock->l_export) { + class_export_lock_put(lock->l_export, lock); + lock->l_export = NULL; + } + + if (lock->l_lvb_data != NULL) + OBD_FREE(lock->l_lvb_data, lock->l_lvb_len); + + ldlm_interval_free(ldlm_interval_detach(lock)); + lu_ref_fini(&lock->l_reference); + OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); + } + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_put); + +/** + * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. + */ +int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) +{ + int rc = 0; + if (!list_empty(&lock->l_lru)) { + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_del_init(&lock->l_lru); + if (lock->l_flags & LDLM_FL_SKIPPED) + lock->l_flags &= ~LDLM_FL_SKIPPED; + LASSERT(ns->ns_nr_unused > 0); + ns->ns_nr_unused--; + rc = 1; + } + return rc; +} + +/** + * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. + */ +int ldlm_lock_remove_from_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + int rc; + + ENTRY; + if (lock->l_ns_srv) { + LASSERT(list_empty(&lock->l_lru)); + RETURN(0); + } + + spin_lock(&ns->ns_lock); + rc = ldlm_lock_remove_from_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; + return rc; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. + */ +void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + lock->l_last_used = cfs_time_current(); + LASSERT(list_empty(&lock->l_lru)); + LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); + list_add_tail(&lock->l_lru, &ns->ns_unused_list); + LASSERT(ns->ns_nr_unused >= 0); + ns->ns_nr_unused++; +} + +/** + * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks + * first. + */ +void ldlm_lock_add_to_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + spin_lock(&ns->ns_lock); + ldlm_lock_add_to_lru_nolock(lock); + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Moves LDLM lock \a lock that is already in namespace LRU to the tail of + * the LRU. Performs necessary LRU locking + */ +void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + ENTRY; + if (lock->l_ns_srv) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + if (!list_empty(&lock->l_lru)) { + ldlm_lock_remove_from_lru_nolock(lock); + ldlm_lock_add_to_lru_nolock(lock); + } + spin_unlock(&ns->ns_lock); + EXIT; +} + +/** + * Helper to destroy a locked lock. + * + * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock + * Must be called with l_lock and lr_lock held. + * + * Does not actually free the lock data, but rather marks the lock as + * destroyed by setting l_destroyed field in the lock to 1. Destroys a + * handle->lock association too, so that the lock can no longer be found + * and removes the lock from LRU list. Actual lock freeing occurs when + * last lock reference goes away. + * + * Original comment (of some historical value): + * This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil + */ +int ldlm_lock_destroy_internal(struct ldlm_lock *lock) +{ + ENTRY; + + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); + LBUG(); + } + + if (lock->l_destroyed) { + LASSERT(list_empty(&lock->l_lru)); + EXIT; + return 0; + } + lock->l_destroyed = 1; + + if (lock->l_export && lock->l_export->exp_lock_hash) { + /* NB: it's safe to call cfs_hash_del() even lock isn't + * in exp_lock_hash. */ + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_del(lock->l_export->exp_lock_hash, + &lock->l_remote_handle, &lock->l_exp_hash); + } + + ldlm_lock_remove_from_lru(lock); + class_handle_unhash(&lock->l_handle); + +#if 0 + /* Wake anyone waiting for this lock */ + /* FIXME: I should probably add yet another flag, instead of using + * l_export to only call this on clients */ + if (lock->l_export) + class_export_put(lock->l_export); + lock->l_export = NULL; + if (lock->l_export && lock->l_completion_ast) + lock->l_completion_ast(lock, 0); +#endif + EXIT; + return 1; +} + +/** + * Destroys a LDLM lock \a lock. Performs necessary locking first. + */ +void ldlm_lock_destroy(struct ldlm_lock *lock) +{ + int first; + ENTRY; + lock_res_and_lock(lock); + first = ldlm_lock_destroy_internal(lock); + unlock_res_and_lock(lock); + + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/** + * Destroys a LDLM lock \a lock that is already locked. + */ +void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) +{ + int first; + ENTRY; + first = ldlm_lock_destroy_internal(lock); + /* drop reference from hashtable only for first destroy */ + if (first) { + lu_ref_del(&lock->l_reference, "hash", lock); + LDLM_LOCK_RELEASE(lock); + } + EXIT; +} + +/* this is called by portals_handle2object with the handle lock taken */ +static void lock_handle_addref(void *lock) +{ + LDLM_LOCK_GET((struct ldlm_lock *)lock); +} + +static void lock_handle_free(void *lock, int size) +{ + LASSERT(size == sizeof(struct ldlm_lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, size); +} + +struct portals_handle_ops lock_handle_ops = { + .hop_addref = lock_handle_addref, + .hop_free = lock_handle_free, +}; + +/** + * + * Allocate and initialize new lock structure. + * + * usage: pass in a resource on which you have done ldlm_resource_get + * new lock will take over the refcount. + * returns: lock with refcount 2 - one for current caller and one for remote + */ +static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) +{ + struct ldlm_lock *lock; + ENTRY; + + if (resource == NULL) + LBUG(); + + OBD_SLAB_ALLOC_PTR_GFP(lock, ldlm_lock_slab, __GFP_IO); + if (lock == NULL) + RETURN(NULL); + + spin_lock_init(&lock->l_lock); + lock->l_resource = resource; + lu_ref_add(&resource->lr_reference, "lock", lock); + + atomic_set(&lock->l_refc, 2); + INIT_LIST_HEAD(&lock->l_res_link); + INIT_LIST_HEAD(&lock->l_lru); + INIT_LIST_HEAD(&lock->l_pending_chain); + INIT_LIST_HEAD(&lock->l_bl_ast); + INIT_LIST_HEAD(&lock->l_cp_ast); + INIT_LIST_HEAD(&lock->l_rk_ast); + init_waitqueue_head(&lock->l_waitq); + lock->l_blocking_lock = NULL; + INIT_LIST_HEAD(&lock->l_sl_mode); + INIT_LIST_HEAD(&lock->l_sl_policy); + INIT_HLIST_NODE(&lock->l_exp_hash); + INIT_HLIST_NODE(&lock->l_exp_flock_hash); + + lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, + LDLM_NSS_LOCKS); + INIT_LIST_HEAD(&lock->l_handle.h_link); + class_handle_hash(&lock->l_handle, &lock_handle_ops); + + lu_ref_init(&lock->l_reference); + lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&lock->l_exp_refs_link); + lock->l_exp_refs_nr = 0; + lock->l_exp_refs_target = NULL; +#endif + INIT_LIST_HEAD(&lock->l_exp_list); + + RETURN(lock); +} + +/** + * Moves LDLM lock \a lock to another resource. + * This is used on client when server returns some other lock than requested + * (typically as a result of intent operation) + */ +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + const struct ldlm_res_id *new_resid) +{ + struct ldlm_resource *oldres = lock->l_resource; + struct ldlm_resource *newres; + int type; + ENTRY; + + LASSERT(ns_is_client(ns)); + + lock_res_and_lock(lock); + if (memcmp(new_resid, &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) == 0) { + /* Nothing to do */ + unlock_res_and_lock(lock); + RETURN(0); + } + + LASSERT(new_resid->name[0] != 0); + + /* This function assumes that the lock isn't on any lists */ + LASSERT(list_empty(&lock->l_res_link)); + + type = oldres->lr_type; + unlock_res_and_lock(lock); + + newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); + if (newres == NULL) + RETURN(-ENOMEM); + + lu_ref_add(&newres->lr_reference, "lock", lock); + /* + * To flip the lock from the old to the new resource, lock, oldres and + * newres have to be locked. Resource spin-locks are nested within + * lock->l_lock, and are taken in the memory address order to avoid + * dead-locks. + */ + spin_lock(&lock->l_lock); + oldres = lock->l_resource; + if (oldres < newres) { + lock_res(oldres); + lock_res_nested(newres, LRT_NEW); + } else { + lock_res(newres); + lock_res_nested(oldres, LRT_NEW); + } + LASSERT(memcmp(new_resid, &oldres->lr_name, + sizeof oldres->lr_name) != 0); + lock->l_resource = newres; + unlock_res(oldres); + unlock_res_and_lock(lock); + + /* ...and the flowers are still standing! */ + lu_ref_del(&oldres->lr_reference, "lock", lock); + ldlm_resource_putref(oldres); + + RETURN(0); +} +EXPORT_SYMBOL(ldlm_lock_change_resource); + +/** \defgroup ldlm_handles LDLM HANDLES + * Ways to get hold of locks without any addresses. + * @{ + */ + +/** + * Fills in handle for LDLM lock \a lock into supplied \a lockh + * Does not take any references. + */ +void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) +{ + lockh->cookie = lock->l_handle.h_cookie; +} +EXPORT_SYMBOL(ldlm_lock2handle); + +/** + * Obtain a lock reference by handle. + * + * if \a flags: atomically get the lock and set the flags. + * Return NULL if flag already set + */ +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, + __u64 flags) +{ + struct ldlm_lock *lock; + ENTRY; + + LASSERT(handle); + + lock = class_handle2object(handle->cookie); + if (lock == NULL) + RETURN(NULL); + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if (flags == 0 && !lock->l_destroyed) { + lu_ref_add(&lock->l_reference, "handle", current); + RETURN(lock); + } + + lock_res_and_lock(lock); + + LASSERT(lock->l_resource != NULL); + + lu_ref_add_atomic(&lock->l_reference, "handle", current); + if (unlikely(lock->l_destroyed)) { + unlock_res_and_lock(lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + if (flags && (lock->l_flags & flags)) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + RETURN(NULL); + } + + if (flags) + lock->l_flags |= flags; + + unlock_res_and_lock(lock); + RETURN(lock); +} +EXPORT_SYMBOL(__ldlm_handle2lock); +/** @} ldlm_handles */ + +/** + * Fill in "on the wire" representation for given LDLM lock into supplied + * lock descriptor \a desc structure. + */ +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) +{ + struct obd_export *exp = lock->l_export ?: lock->l_conn_export; + + /* INODEBITS_INTEROP: If the other side does not support + * inodebits, reply with a plain lock descriptor. */ + if ((lock->l_resource->lr_type == LDLM_IBITS) && + (exp && !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) { + /* Make sure all the right bits are set in this lock we + are going to pass to client */ + LASSERTF(lock->l_policy_data.l_inodebits.bits == + (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT), + "Inappropriate inode lock bits during " + "conversion " LPU64 "\n", + lock->l_policy_data.l_inodebits.bits); + + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_resource.lr_type = LDLM_PLAIN; + + /* Convert "new" lock mode to something old client can + understand */ + if ((lock->l_req_mode == LCK_CR) || + (lock->l_req_mode == LCK_CW)) + desc->l_req_mode = LCK_PR; + else + desc->l_req_mode = lock->l_req_mode; + if ((lock->l_granted_mode == LCK_CR) || + (lock->l_granted_mode == LCK_CW)) { + desc->l_granted_mode = LCK_PR; + } else { + /* We never grant PW/EX locks to clients */ + LASSERT((lock->l_granted_mode != LCK_PW) && + (lock->l_granted_mode != LCK_EX)); + desc->l_granted_mode = lock->l_granted_mode; + } + + /* We do not copy policy here, because there is no + policy for plain locks */ + } else { + ldlm_res2desc(lock->l_resource, &desc->l_resource); + desc->l_req_mode = lock->l_req_mode; + desc->l_granted_mode = lock->l_granted_mode; + ldlm_convert_policy_to_wire(lock->l_resource->lr_type, + &lock->l_policy_data, + &desc->l_policy_data); + } +} +EXPORT_SYMBOL(ldlm_lock2desc); + +/** + * Add a lock to list of conflicting locks to send AST to. + * + * Only add if we have not sent a blocking AST to the lock yet. + */ +void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_AST_SENT) == 0) { + LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); + lock->l_flags |= LDLM_FL_AST_SENT; + /* If the enqueuing client said so, tell the AST recipient to + * discard dirty data, rather than writing back. */ + if (new->l_flags & LDLM_AST_DISCARD_DATA) + lock->l_flags |= LDLM_FL_DISCARD_DATA; + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, work_list); + LDLM_LOCK_GET(lock); + LASSERT(lock->l_blocking_lock == NULL); + lock->l_blocking_lock = LDLM_LOCK_GET(new); + } +} + +/** + * Add a lock to list of just granted locks to send completion AST to. + */ +void ldlm_add_cp_work_item(struct ldlm_lock *lock, struct list_head *work_list) +{ + if ((lock->l_flags & LDLM_FL_CP_REQD) == 0) { + lock->l_flags |= LDLM_FL_CP_REQD; + LDLM_DEBUG(lock, "lock granted; sending completion AST."); + LASSERT(list_empty(&lock->l_cp_ast)); + list_add(&lock->l_cp_ast, work_list); + LDLM_LOCK_GET(lock); + } +} + +/** + * Aggregator function to add AST work items into a list. Determines + * what sort of an AST work needs to be done and calls the proper + * adding function. + * Must be called with lr_lock held. + */ +void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, + struct list_head *work_list) +{ + ENTRY; + check_res_locked(lock->l_resource); + if (new) + ldlm_add_bl_work_item(lock, new, work_list); + else + ldlm_add_cp_work_item(lock, work_list); + EXIT; +} + +/** + * Add specified reader/writer reference to LDLM lock with handle \a lockh. + * r/w reference type is determined by \a mode + * Calls ldlm_lock_addref_internal. + */ +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + ldlm_lock_addref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_addref); + +/** + * Helper function. + * Add specified reader/writer reference to LDLM lock \a lock. + * r/w reference type is determined by \a mode + * Removes lock from LRU if it is there. + * Assumes the LDLM lock is already locked. + */ +void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + ldlm_lock_remove_from_lru(lock); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + lock->l_readers++; + lu_ref_add_atomic(&lock->l_reference, "reader", lock); + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + lock->l_writers++; + lu_ref_add_atomic(&lock->l_reference, "writer", lock); + } + LDLM_LOCK_GET(lock); + lu_ref_add_atomic(&lock->l_reference, "user", lock); + LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); +} + +/** + * Attempts to add reader/writer reference to a lock with handle \a lockh, and + * fails if lock is already LDLM_FL_CBPENDING or destroyed. + * + * \retval 0 success, lock was addref-ed + * + * \retval -EAGAIN lock is being canceled. + */ +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock; + int result; + + result = -EAGAIN; + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_readers != 0 || lock->l_writers != 0 || + !(lock->l_flags & LDLM_FL_CBPENDING)) { + ldlm_lock_addref_internal_nolock(lock, mode); + result = 0; + } + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return result; +} +EXPORT_SYMBOL(ldlm_lock_addref_try); + +/** + * Add specified reader/writer reference to LDLM lock \a lock. + * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. + * Only called for local locks. + */ +void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode) +{ + lock_res_and_lock(lock); + ldlm_lock_addref_internal_nolock(lock, mode); + unlock_res_and_lock(lock); +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Assumes LDLM lock is already locked. + * only called in ldlm_flock_destroy and for local locks. + * Does NOT add lock to LRU if no r/w references left to accomodate flock locks + * that cannot be placed in LRU. + */ +void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode) +{ + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + if (mode & (LCK_NL | LCK_CR | LCK_PR)) { + LASSERT(lock->l_readers > 0); + lu_ref_del(&lock->l_reference, "reader", lock); + lock->l_readers--; + } + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { + LASSERT(lock->l_writers > 0); + lu_ref_del(&lock->l_reference, "writer", lock); + lock->l_writers--; + } + + lu_ref_del(&lock->l_reference, "user", lock); + LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ +} + +/** + * Removes reader/writer reference for LDLM lock \a lock. + * Locks LDLM lock first. + * If the lock is determined to be client lock on a client and r/w refcount + * drops to zero and the lock is not blocked, the lock is added to LRU lock + * on the namespace. + * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. + */ +void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) +{ + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + ns = ldlm_lock_to_ns(lock); + + ldlm_lock_decref_internal_nolock(lock, mode); + + /* release lvb data for layout lock */ + if (ns_is_client(ns) && !lock->l_readers && !lock->l_writers && + ldlm_has_layout(lock) && lock->l_flags & LDLM_FL_LVB_READY) { + /* this is the last user of a layout lock and stripe has + * been set up, lvb is no longer used. + * This may be a large amount of memory, so we should free it + * when possible. */ + if (lock->l_lvb_data != NULL) { + OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len); + lock->l_lvb_data = NULL; + lock->l_lvb_len = 0; + } + } + + if (lock->l_flags & LDLM_FL_LOCAL && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. */ + CDEBUG(D_INFO, "forcing cancel of local lock\n"); + lock->l_flags |= LDLM_FL_CBPENDING; + } + + if (!lock->l_readers && !lock->l_writers && + (lock->l_flags & LDLM_FL_CBPENDING)) { + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (lock->l_ns_srv && lock->l_export) + CERROR("FL_CBPENDING set on non-local lock--just a " + "warning\n"); + + LDLM_DEBUG(lock, "final decref done on cbpending lock"); + + LDLM_LOCK_GET(lock); /* dropped by bl thread */ + ldlm_lock_remove_from_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + if ((lock->l_flags & LDLM_FL_ATOMIC_CB) || + ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) + ldlm_handle_bl_callback(ns, NULL, lock); + } else if (ns_is_client(ns) && + !lock->l_readers && !lock->l_writers && + !(lock->l_flags & LDLM_FL_NO_LRU) && + !(lock->l_flags & LDLM_FL_BL_AST)) { + + LDLM_DEBUG(lock, "add lock into lru list"); + + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. */ + ldlm_lock_add_to_lru(lock); + unlock_res_and_lock(lock); + + if (lock->l_flags & LDLM_FL_FAIL_LOC) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE + * are not supported by the server, otherwise, it is done on + * enqueue. */ + if (!exp_connect_cancelset(lock->l_conn_export) && + !ns_connect_lru_resize(ns)) + ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); + } else { + LDLM_DEBUG(lock, "do not add lock into lru list"); + unlock_res_and_lock(lock); + } + + EXIT; +} + +/** + * Decrease reader/writer refcount for LDLM lock with handle \a lockh + */ +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERTF(lock != NULL, "Non-existing lock: "LPX64"\n", lockh->cookie); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref); + +/** + * Decrease reader/writer refcount for LDLM lock with handle + * \a lockh and mark it for subsequent cancellation once r/w refcount + * drops to zero instead of putting into LRU. + * + * Typical usage is for GROUP locks which we cannot allow to be cached. + */ +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(lock); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); + +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + +/** + * Finds a position to insert the new lock into granted lock list. + * + * Used for locks eligible for skiplist optimization. + * + * Parameters: + * queue [input]: the granted list where search acts on; + * req [input]: the lock whose position to be located; + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev + * NOTE: called by + * - ldlm_grant_lock_with_skiplist + */ +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) +{ + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; + ENTRY; + + list_for_each(tmp, queue) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + mode_end = list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, l_sl_mode); + + if (lock->l_req_mode != req->l_req_mode) { + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; + continue; + } + + /* suitable mode group is found */ + if (lock->l_resource->lr_type == LDLM_PLAIN) { + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else if (lock->l_resource->lr_type == LDLM_IBITS) { + for (;;) { + policy_end = + list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + + if (lock->l_policy_data.l_inodebits.bits == + req->l_policy_data.l_inodebits.bits) { + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; + } + + if (policy_end == mode_end) + /* done with mode group */ + break; + + /* go to next policy group within mode group */ + tmp = policy_end->l_res_link.next; + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; + } else { + LDLM_ERROR(lock,"is not LDLM_PLAIN or LDLM_IBITS lock"); + LBUG(); + } + } + + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; +} + +/** + * Add a lock into resource granted list after a position described by + * \a prev. + */ +static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, + struct sl_insert_point *prev) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(lock, "About to add lock:"); + + if (lock->l_destroyed) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); + + /* + * lock->link == prev->link means lock is first starting the group. + * Don't re-add to itself to suppress kernel warnings. + */ + if (&lock->l_res_link != prev->res_link) + list_add(&lock->l_res_link, prev->res_link); + if (&lock->l_sl_mode != prev->mode_link) + list_add(&lock->l_sl_mode, prev->mode_link); + if (&lock->l_sl_policy != prev->policy_link) + list_add(&lock->l_sl_policy, prev->policy_link); + + EXIT; +} + +/** + * Add a lock to granted list on a resource maintaining skiplist + * correctness. + */ +static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) +{ + struct sl_insert_point prev; + ENTRY; + + LASSERT(lock->l_req_mode == lock->l_granted_mode); + + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); + EXIT; +} + +/** + * Perform lock granting bookkeeping. + * + * Includes putting the lock into granted list and updating lock mode. + * NOTE: called by + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * - ldlm_lock_convert + * + * must be called with lr_lock held + */ +void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) +{ + struct ldlm_resource *res = lock->l_resource; + ENTRY; + + check_res_locked(res); + + lock->l_granted_mode = lock->l_req_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) + ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_resource_add_lock(res, &res->lr_granted, lock); + + if (lock->l_granted_mode < res->lr_most_restr) + res->lr_most_restr = lock->l_granted_mode; + + if (work_list && lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, work_list); + + ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); + EXIT; +} + +/** + * Search for a lock with given properties in a queue. + * + * \retval a referenced lock or NULL. See the flag descriptions below, in the + * comment above ldlm_lock_match + */ +static struct ldlm_lock *search_queue(struct list_head *queue, + ldlm_mode_t *mode, + ldlm_policy_data_t *policy, + struct ldlm_lock *old_lock, + __u64 flags, int unref) +{ + struct ldlm_lock *lock; + struct list_head *tmp; + + list_for_each(tmp, queue) { + ldlm_mode_t match; + + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (lock == old_lock) + break; + + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. + * this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (lock->l_flags & LDLM_FL_CBPENDING && + !(flags & LDLM_FL_CBPENDING)) + continue; + if (!unref && lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + continue; + + if (!(lock->l_req_mode & *mode)) + continue; + match = lock->l_req_mode; + + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_policy_data.l_extent.start > + policy->l_extent.start || + lock->l_policy_data.l_extent.end < policy->l_extent.end)) + continue; + + if (unlikely(match == LCK_GROUP) && + lock->l_resource->lr_type == LDLM_EXTENT && + lock->l_policy_data.l_extent.gid != policy->l_extent.gid) + continue; + + /* We match if we have existing lock with same or wider set + of bits. */ + if (lock->l_resource->lr_type == LDLM_IBITS && + ((lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits) != + policy->l_inodebits.bits)) + continue; + + if (!unref && + (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED || + lock->l_failed)) + continue; + + if ((flags & LDLM_FL_LOCAL_ONLY) && + !(lock->l_flags & LDLM_FL_LOCAL)) + continue; + + if (flags & LDLM_FL_TEST_LOCK) { + LDLM_LOCK_GET(lock); + ldlm_lock_touch_in_lru(lock); + } else { + ldlm_lock_addref_internal_nolock(lock, match); + } + *mode = match; + return lock; + } + + return NULL; +} + +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) +{ + if (!lock->l_failed) { + lock->l_failed = 1; + wake_up_all(&lock->l_waitq); + } +} +EXPORT_SYMBOL(ldlm_lock_fail_match_locked); + +void ldlm_lock_fail_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_fail_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_fail_match); + +/** + * Mark lock as "matchable" by OST. + * + * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB + * is not yet valid. + * Assumes LDLM lock is already locked. + */ +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) +{ + lock->l_flags |= LDLM_FL_LVB_READY; + wake_up_all(&lock->l_waitq); +} +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); + +/** + * Mark lock as "matchable" by OST. + * Locks the lock and then \see ldlm_lock_allow_match_locked + */ +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); + unlock_res_and_lock(lock); +} +EXPORT_SYMBOL(ldlm_lock_allow_match); + +/** + * Attempt to find a lock with specified properties. + * + * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is + * set in \a flags + * + * Can be called in two ways: + * + * If 'ns' is NULL, then lockh describes an existing lock that we want to look + * for a duplicate of. + * + * Otherwise, all of the fields must be filled in, to match against. + * + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer refernces + * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, + * just tell us if we would have matched. + * + * \retval 1 if it finds an already-existing lock that is compatible; in this + * case, lockh is filled in with a addref()ed lock + * + * We also check security context, and if that fails we simply return 0 (to + * keep caller code unchanged), the context failure will be discovered by + * caller sometime later. + */ +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *res_id, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh, int unref) +{ + struct ldlm_resource *res; + struct ldlm_lock *lock, *old_lock = NULL; + int rc = 0; + ENTRY; + + if (ns == NULL) { + old_lock = ldlm_handle2lock(lockh); + LASSERT(old_lock); + + ns = ldlm_lock_to_ns(old_lock); + res_id = &old_lock->l_resource->lr_name; + type = old_lock->l_resource->lr_type; + mode = old_lock->l_req_mode; + } + + res = ldlm_resource_get(ns, NULL, res_id, type, 0); + if (res == NULL) { + LASSERT(old_lock == NULL); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + lock_res(res); + + lock = search_queue(&res->lr_granted, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) + GOTO(out, rc = 1); + if (flags & LDLM_FL_BLOCK_GRANTED) + GOTO(out, rc = 0); + lock = search_queue(&res->lr_converting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) + GOTO(out, rc = 1); + lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, + flags, unref); + if (lock != NULL) + GOTO(out, rc = 1); + + EXIT; + out: + unlock_res(res); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + + if (lock) { + ldlm_lock2handle(lock, lockh); + if ((flags & LDLM_FL_LVB_READY) && + (!(lock->l_flags & LDLM_FL_LVB_READY))) { + struct l_wait_info lwi; + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, + mode); + rc = 0; + goto out2; + } + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + NULL, LWI_ON_SIGNAL_NOOP, NULL); + + /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ + l_wait_event(lock->l_waitq, + lock->l_flags & LDLM_FL_LVB_READY || + lock->l_destroyed || lock->l_failed, + &lwi); + if (!(lock->l_flags & LDLM_FL_LVB_READY)) { + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + else + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + } + } + out2: + if (rc) { + LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")", + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] : policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + + /* check user's security context */ + if (lock->l_conn_export && + sptlrpc_import_check_ctx( + class_exp2cliimp(lock->l_conn_export))) { + if (!(flags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref_internal(lock, mode); + rc = 0; + } + + if (flags & LDLM_FL_TEST_LOCK) + LDLM_LOCK_RELEASE(lock); + + } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ + LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res " + LPU64"/"LPU64" ("LPU64" "LPU64")", ns, + type, mode, res_id->name[0], res_id->name[1], + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[2] :policy->l_extent.start, + (type == LDLM_PLAIN || type == LDLM_IBITS) ? + res_id->name[3] : policy->l_extent.end); + } + if (old_lock) + LDLM_LOCK_PUT(old_lock); + + return rc ? mode : 0; +} +EXPORT_SYMBOL(ldlm_lock_match); + +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits) +{ + struct ldlm_lock *lock; + ldlm_mode_t mode = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (lock != NULL) { + lock_res_and_lock(lock); + if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED || + lock->l_failed) + GOTO(out, mode); + + if (lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) + GOTO(out, mode); + + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + mode = lock->l_granted_mode; + ldlm_lock_addref_internal_nolock(lock, mode); + } + + EXIT; + +out: + if (lock != NULL) { + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + } + return mode; +} +EXPORT_SYMBOL(ldlm_revalidate_lock_handle); + +/** The caller must guarantee that the buffer is large enough. */ +int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, + enum req_location loc, void *data, int size) +{ + void *lvb; + ENTRY; + + LASSERT(data != NULL); + LASSERT(size >= 0); + + switch (lock->l_lvb_type) { + case LVB_T_OST: + if (size == sizeof(struct ost_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else if (size == sizeof(struct ost_lvb_v1)) { + struct ost_lvb *olvb = data; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_ost_lvb_v1); + else + lvb = req_capsule_server_sized_swab_get(pill, + &RMF_DLM_LVB, size, + lustre_swab_ost_lvb_v1); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + olvb->lvb_mtime_ns = 0; + olvb->lvb_atime_ns = 0; + olvb->lvb_ctime_ns = 0; + } else { + LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LQUOTA: + if (size == sizeof(struct lquota_lvb)) { + if (loc == RCL_CLIENT) + lvb = req_capsule_client_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + else + lvb = req_capsule_server_swab_get(pill, + &RMF_DLM_LVB, + lustre_swab_lquota_lvb); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + } else { + LDLM_ERROR(lock, "Replied unexpected lquota LVB size %d", + size); + RETURN(-EINVAL); + } + break; + case LVB_T_LAYOUT: + if (size == 0) + break; + + if (loc == RCL_CLIENT) + lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); + else + lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); + if (unlikely(lvb == NULL)) { + LDLM_ERROR(lock, "no LVB"); + RETURN(-EPROTO); + } + + memcpy(data, lvb, size); + break; + default: + LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type); + libcfs_debug_dumpstack(NULL); + RETURN(-EINVAL); + } + + RETURN(0); +} + +/** + * Create and fill in new LDLM lock with specified properties. + * Returns a referenced lock + */ +struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, + ldlm_mode_t mode, + const struct ldlm_callback_suite *cbs, + void *data, __u32 lvb_len, + enum lvb_type lvb_type) +{ + struct ldlm_lock *lock; + struct ldlm_resource *res; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, type, 1); + if (res == NULL) + RETURN(NULL); + + lock = ldlm_lock_new(res); + + if (lock == NULL) + RETURN(NULL); + + lock->l_req_mode = mode; + lock->l_ast_data = data; + lock->l_pid = current_pid(); + lock->l_ns_srv = !!ns_is_server(ns); + if (cbs) { + lock->l_blocking_ast = cbs->lcs_blocking; + lock->l_completion_ast = cbs->lcs_completion; + lock->l_glimpse_ast = cbs->lcs_glimpse; + lock->l_weigh_ast = cbs->lcs_weigh; + } + + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) { + if (ldlm_interval_alloc(lock) == NULL) + GOTO(out, 0); + } + + if (lvb_len) { + lock->l_lvb_len = lvb_len; + OBD_ALLOC(lock->l_lvb_data, lvb_len); + if (lock->l_lvb_data == NULL) + GOTO(out, 0); + } + + lock->l_lvb_type = lvb_type; + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) + GOTO(out, 0); + + RETURN(lock); + +out: + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + return NULL; +} + +/** + * Enqueue (request) a lock. + * + * Does not block. As a result of enqueue the lock would be put + * into granted or waiting list. + * + * If namespace has intent policy sent and the lock has LDLM_FL_HAS_INTENT flag + * set, skip all the enqueueing and delegate lock processing to intent policy + * function. + */ +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, + struct ldlm_lock **lockp, + void *cookie, __u64 *flags) +{ + struct ldlm_lock *lock = *lockp; + struct ldlm_resource *res = lock->l_resource; + int local = ns_is_client(ldlm_res_to_ns(res)); + ldlm_error_t rc = ELDLM_OK; + struct ldlm_interval *node = NULL; + ENTRY; + + lock->l_last_activity = cfs_time_current_sec(); + /* policies are not executed on the client or during replay */ + if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT + && !local && ns->ns_policy) { + rc = ns->ns_policy(ns, lockp, cookie, lock->l_req_mode, *flags, + NULL); + if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. If it's not the same as the + * one we passed in, then destroy the old one and our + * work here is done. */ + if (lock != *lockp) { + ldlm_lock_destroy(lock); + LDLM_LOCK_RELEASE(lock); + } + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); + } else if (rc != ELDLM_OK || + (rc == ELDLM_OK && (*flags & LDLM_FL_INTENT_ONLY))) { + ldlm_lock_destroy(lock); + RETURN(rc); + } + } + + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO); + + lock_res_and_lock(lock); + if (local && lock->l_req_mode == lock->l_granted_mode) { + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ + *flags &= ~(LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); + GOTO(out, ELDLM_OK); + } + + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + + /* Some flags from the enqueue want to make it into the AST, via the + * lock's l_flags. */ + lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA; + + /* This distinction between local lock trees is very important; a client + * namespace only has information about locks taken by that client, and + * thus doesn't have enough information to decide for itself if it can + * be granted (below). In this case, we do exactly what the server + * tells us to do, as dictated by the 'flags'. + * + * We do exactly the same thing during recovery, when the server is + * more or less trusting the clients not to lie. + * + * FIXME (bug 268): Detect obvious lies by checking compatibility in + * granted/converting queues. */ + if (local) { + if (*flags & LDLM_FL_BLOCK_CONV) + ldlm_resource_add_lock(res, &res->lr_converting, lock); + else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); + else + ldlm_grant_lock(lock, NULL); + GOTO(out, ELDLM_OK); + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + +out: + unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + return rc; +} + + +/** + * Process a call to blocking AST callback for a lock in ast_work list + */ +static int +ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc d; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_bl_ast); + + /* nobody should touch l_bl_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_bl_ast); + + LASSERT(lock->l_flags & LDLM_FL_AST_SENT); + LASSERT(lock->l_bl_ast_run == 0); + LASSERT(lock->l_blocking_lock); + lock->l_bl_ast_run++; + unlock_res_and_lock(lock); + + ldlm_lock2desc(lock->l_blocking_lock, &d); + + rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to completion AST callback for a lock in ast_work list + */ +static int +ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + int rc = 0; + struct ldlm_lock *lock; + ldlm_completion_callback completion_callback; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_cp_ast); + + /* It's possible to receive a completion AST before we've set + * the l_completion_ast pointer: either because the AST arrived + * before the reply, or simply because there's a small race + * window between receiving the reply and finishing the local + * enqueue. (bug 842) + * + * This can't happen with the blocking_ast, however, because we + * will never call the local blocking_ast until we drop our + * reader/writer reference, which we won't do until we get the + * reply and finish enqueueing. */ + + /* nobody should touch l_cp_ast */ + lock_res_and_lock(lock); + list_del_init(&lock->l_cp_ast); + LASSERT(lock->l_flags & LDLM_FL_CP_REQD); + /* save l_completion_ast since it can be changed by + * mds_intent_policy(), see bug 14225 */ + completion_callback = lock->l_completion_ast; + lock->l_flags &= ~LDLM_FL_CP_REQD; + unlock_res_and_lock(lock); + + if (completion_callback != NULL) + rc = completion_callback(lock, 0, (void *)arg); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to revocation AST callback for a lock in ast_work list + */ +static int +ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_lock_desc desc; + int rc; + struct ldlm_lock *lock; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + lock = list_entry(arg->list->next, struct ldlm_lock, l_rk_ast); + list_del_init(&lock->l_rk_ast); + + /* the desc just pretend to exclusive */ + ldlm_lock2desc(lock, &desc); + desc.l_req_mode = LCK_EX; + desc.l_granted_mode = 0; + + rc = lock->l_blocking_ast(lock, &desc, (void*)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock); + + RETURN(rc); +} + +/** + * Process a call to glimpse AST callback for a lock in ast_work list + */ +int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) +{ + struct ldlm_cb_set_arg *arg = opaq; + struct ldlm_glimpse_work *gl_work; + struct ldlm_lock *lock; + int rc = 0; + ENTRY; + + if (list_empty(arg->list)) + RETURN(-ENOENT); + + gl_work = list_entry(arg->list->next, struct ldlm_glimpse_work, + gl_list); + list_del_init(&gl_work->gl_list); + + lock = gl_work->gl_lock; + + /* transfer the glimpse descriptor to ldlm_cb_set_arg */ + arg->gl_desc = gl_work->gl_desc; + + /* invoke the actual glimpse callback */ + if (lock->l_glimpse_ast(lock, (void*)arg) == 0) + rc = 1; + + LDLM_LOCK_RELEASE(lock); + + if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) + OBD_FREE_PTR(gl_work); + + RETURN(rc); +} + +/** + * Process list of locks in need of ASTs being sent. + * + * Used on server to send multiple ASTs together instead of sending one by + * one. + */ +int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, + ldlm_desc_ast_t ast_type) +{ + struct ldlm_cb_set_arg *arg; + set_producer_func work_ast_lock; + int rc; + + if (list_empty(rpc_list)) + RETURN(0); + + OBD_ALLOC_PTR(arg); + if (arg == NULL) + RETURN(-ENOMEM); + + atomic_set(&arg->restart, 0); + arg->list = rpc_list; + + switch (ast_type) { + case LDLM_WORK_BL_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_bl_ast_lock; + break; + case LDLM_WORK_CP_AST: + arg->type = LDLM_CP_CALLBACK; + work_ast_lock = ldlm_work_cp_ast_lock; + break; + case LDLM_WORK_REVOKE_AST: + arg->type = LDLM_BL_CALLBACK; + work_ast_lock = ldlm_work_revoke_ast_lock; + break; + case LDLM_WORK_GL_AST: + arg->type = LDLM_GL_CALLBACK; + work_ast_lock = ldlm_work_gl_ast_lock; + break; + default: + LBUG(); + } + + /* We create a ptlrpc request set with flow control extension. + * This request set will use the work_ast_lock function to produce new + * requests and will send a new request each time one completes in order + * to keep the number of requests in flight to ns_max_parallel_ast */ + arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, + work_ast_lock, arg); + if (arg->set == NULL) + GOTO(out, rc = -ENOMEM); + + ptlrpc_set_wait(arg->set); + ptlrpc_set_destroy(arg->set); + + rc = atomic_read(&arg->restart) ? -ERESTART : 0; + GOTO(out, rc); +out: + OBD_FREE_PTR(arg); + return rc; +} + +static int reprocess_one_queue(struct ldlm_resource *res, void *closure) +{ + ldlm_reprocess_all(res); + return LDLM_ITER_CONTINUE; +} + +static int ldlm_reprocess_res(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int rc; + + rc = reprocess_one_queue(res, arg); + + return rc == LDLM_ITER_STOP; +} + +/** + * Iterate through all resources on a namespace attempting to grant waiting + * locks. + */ +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) +{ + ENTRY; + + if (ns != NULL) { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_reprocess_res, NULL); + } + EXIT; +} +EXPORT_SYMBOL(ldlm_reprocess_all_ns); + +/** + * Try to grant all waiting locks on a resource. + * + * Calls ldlm_reprocess_queue on converting and waiting queues. + * + * Typically called after some resource locks are cancelled to see + * if anything could be granted as a result of the cancellation. + */ +void ldlm_reprocess_all(struct ldlm_resource *res) +{ + LIST_HEAD(rpc_list); + + ENTRY; + if (!ns_is_client(ldlm_res_to_ns(res))) { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + EXIT; +} + +/** + * Helper function to call blocking AST for LDLM lock \a lock in a + * "cancelling" mode. + */ +void ldlm_cancel_callback(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + if (!(lock->l_flags & LDLM_FL_CANCEL)) { + lock->l_flags |= LDLM_FL_CANCEL; + if (lock->l_blocking_ast) { + unlock_res_and_lock(lock); + lock->l_blocking_ast(lock, NULL, lock->l_ast_data, + LDLM_CB_CANCELING); + lock_res_and_lock(lock); + } else { + LDLM_DEBUG(lock, "no blocking ast"); + } + } + lock->l_flags |= LDLM_FL_BL_DONE; +} + +/** + * Remove skiplist-enabled LDLM lock \a req from granted list + */ +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) +{ + if (req->l_resource->lr_type != LDLM_PLAIN && + req->l_resource->lr_type != LDLM_IBITS) + return; + + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); +} + +/** + * Attempts to cancel LDLM lock \a lock that has no reader/writer references. + */ +void ldlm_lock_cancel(struct ldlm_lock *lock) +{ + struct ldlm_resource *res; + struct ldlm_namespace *ns; + ENTRY; + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. -phik */ + if (lock->l_readers || lock->l_writers) { + LDLM_ERROR(lock, "lock still has references"); + LBUG(); + } + + if (lock->l_waited) + ldlm_del_waiting_lock(lock); + + /* Releases cancel callback. */ + ldlm_cancel_callback(lock); + + /* Yes, second time, just in case it was added again while we were + running with no res lock in ldlm_cancel_callback */ + if (lock->l_waited) + ldlm_del_waiting_lock(lock); + + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + + if (lock->l_granted_mode == lock->l_req_mode) + ldlm_pool_del(&ns->ns_pool, lock); + + /* Make sure we will not be called again for same lock what is possible + * if not to zero out lock->l_granted_mode */ + lock->l_granted_mode = LCK_MINMODE; + unlock_res_and_lock(lock); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_cancel); + +/** + * Set opaque data into the lock that only makes sense to upper layer. + */ +int ldlm_lock_set_data(struct lustre_handle *lockh, void *data) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int rc = -EINVAL; + ENTRY; + + if (lock) { + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + rc = 0; + LDLM_LOCK_PUT(lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_lock_set_data); + +struct export_cl_data { + struct obd_export *ecl_exp; + int ecl_loop; +}; + +/** + * Iterator function for ldlm_cancel_locks_for_export. + * Cancels passed locks. + */ +int ldlm_cancel_locks_for_export_cb(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) + +{ + struct export_cl_data *ecl = (struct export_cl_data *)data; + struct obd_export *exp = ecl->ecl_exp; + struct ldlm_lock *lock = cfs_hash_object(hs, hnode); + struct ldlm_resource *res; + + res = ldlm_resource_getref(lock->l_resource); + LDLM_LOCK_GET(lock); + + LDLM_DEBUG(lock, "export %p", exp); + ldlm_res_lvbo_update(res, NULL, 1); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(res); + ldlm_resource_putref(res); + LDLM_LOCK_RELEASE(lock); + + ecl->ecl_loop++; + if ((ecl->ecl_loop & -ecl->ecl_loop) == ecl->ecl_loop) { + CDEBUG(D_INFO, + "Cancel lock %p for export %p (loop %d), still have " + "%d locks left on hash table.\n", + lock, exp, ecl->ecl_loop, + atomic_read(&hs->hs_count)); + } + + return 0; +} + +/** + * Cancel all locks for given export. + * + * Typically called on client disconnection/eviction + */ +void ldlm_cancel_locks_for_export(struct obd_export *exp) +{ + struct export_cl_data ecl = { + .ecl_exp = exp, + .ecl_loop = 0, + }; + + cfs_hash_for_each_empty(exp->exp_lock_hash, + ldlm_cancel_locks_for_export_cb, &ecl); +} + +/** + * Downgrade an exclusive lock. + * + * A fast variant of ldlm_lock_convert for convertion of exclusive + * locks. The convertion is always successful. + * Used by Commit on Sharing (COS) code. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode) +{ + ENTRY; + + LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX)); + LASSERT(new_mode == LCK_COS); + + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ldlm_pool_del(&ldlm_lock_to_ns(lock)->ns_pool, lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + ldlm_reprocess_all(lock->l_resource); + + EXIT; +} +EXPORT_SYMBOL(ldlm_lock_downgrade); + +/** + * Attempt to convert already granted lock to a different mode. + * + * While lock conversion is not currently used, future client-side + * optimizations could take advantage of it to avoid discarding cached + * pages on a file. + */ +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + LIST_HEAD(rpc_list); + struct ldlm_resource *res; + struct ldlm_namespace *ns; + int granted = 0; + struct ldlm_interval *node; + ENTRY; + + /* Just return if mode is unchanged. */ + if (new_mode == lock->l_granted_mode) { + *flags |= LDLM_FL_BLOCK_GRANTED; + RETURN(lock->l_resource); + } + + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. -jay */ + OBD_SLAB_ALLOC_PTR_GFP(node, ldlm_interval_slab, __GFP_IO); + if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ + RETURN(NULL); + + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), + "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); + + lock_res_and_lock(lock); + + res = lock->l_resource; + ns = ldlm_res_to_ns(res); + + lock->l_req_mode = new_mode; + if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { + ldlm_resource_unlink_lock(lock); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } + } + + /* + * Remove old lock from the pool before adding the lock with new + * mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); + + /* If this is a local resource, put it on the appropriate list. */ + if (ns_is_client(ldlm_res_to_ns(res))) { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { + /* This should never happen, because of the way the + * server handles conversions. */ + LDLM_ERROR(lock, "Erroneous flags %x on local lock\n", + *flags); + LBUG(); + + ldlm_grant_lock(lock, &rpc_list); + granted = 1; + /* FIXME: completion handling not with lr_lock held ! */ + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + } + } else { + CERROR("This is client-side-only module, cannot handle " + "LDLM_NAMESPACE_SERVER resource type lock.\n"); + LBUG(); + } + unlock_res_and_lock(lock); + + if (granted) + ldlm_run_ast_work(ns, &rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + RETURN(res); +} +EXPORT_SYMBOL(ldlm_lock_convert); + +/** + * Print lock with lock handle \a lockh description into debug log. + * + * Used when printing all locks on a resource for debug purposes. + */ +void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + lock = ldlm_handle2lock(lockh); + if (lock == NULL) + return; + + LDLM_DEBUG_LIMIT(level, lock, "###"); + + LDLM_LOCK_PUT(lock); +} +EXPORT_SYMBOL(ldlm_lock_dump_handle); + +/** + * Print lock information with custom message into debug log. + * Helper function. + */ +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ...) +{ + va_list args; + struct obd_export *exp = lock->l_export; + struct ldlm_resource *resource = lock->l_resource; + char *nid = "local"; + + va_start(args, fmt); + + if (exp && exp->exp_connection) { + nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); + } else if (exp && exp->exp_obd != NULL) { + struct obd_import *imp = exp->exp_obd->u.cli.cl_import; + nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); + } + + if (resource == NULL) { + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " + "res: \?\? rrc=\?\? type: \?\?\? flags: "LPX64" nid: %s " + "remote: "LPX64" expref: %d pid: %u timeout: %lu " + "lvb_type: %d\n", + lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + va_end(args); + return; + } + + switch (resource->lr_type) { + case LDLM_EXTENT: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " + "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64 + "] (req "LPU64"->"LPU64") flags: "LPX64" nid: %s remote:" + " "LPX64" expref: %d pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + resource->lr_name.name[0], + resource->lr_name.name[1], + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_extent.start, + lock->l_policy_data.l_extent.end, + lock->l_req_extent.start, lock->l_req_extent.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + break; + + case LDLM_FLOCK: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " + "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d " + "["LPU64"->"LPU64"] flags: "LPX64" nid: %s remote: "LPX64 + " expref: %d pid: %u timeout: %lu\n", + ldlm_lock_to_ns_name(lock), lock, + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + resource->lr_name.name[0], + resource->lr_name.name[1], + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_policy_data.l_flock.pid, + lock->l_policy_data.l_flock.start, + lock->l_policy_data.l_flock.end, + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout); + break; + + case LDLM_IBITS: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " + "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s " + "flags: "LPX64" nid: %s remote: "LPX64" expref: %d " + "pid: %u timeout: %lu lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read (&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + resource->lr_name.name[0], + resource->lr_name.name[1], + lock->l_policy_data.l_inodebits.bits, + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + break; + + default: + libcfs_debug_vmsg2(msgdata, fmt, args, + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " + "res: "LPU64"/"LPU64" rrc: %d type: %s flags: "LPX64" " + "nid: %s remote: "LPX64" expref: %d pid: %u timeout: %lu" + "lvb_type: %d\n", + ldlm_lock_to_ns_name(lock), + lock, lock->l_handle.h_cookie, + atomic_read (&lock->l_refc), + lock->l_readers, lock->l_writers, + ldlm_lockname[lock->l_granted_mode], + ldlm_lockname[lock->l_req_mode], + resource->lr_name.name[0], + resource->lr_name.name[1], + atomic_read(&resource->lr_refcount), + ldlm_typename[resource->lr_type], + lock->l_flags, nid, lock->l_remote_handle.cookie, + exp ? atomic_read(&exp->exp_refcount) : -99, + lock->l_pid, lock->l_callback_timeout, lock->l_lvb_type); + break; + } + va_end(args); +} +EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c new file mode 100644 index 000000000000..324d5e4286dc --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -0,0 +1,1238 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_lockd.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +# include + +#include +#include +#include +#include "ldlm_internal.h" + +static int ldlm_num_threads; +CFS_MODULE_PARM(ldlm_num_threads, "i", int, 0444, + "number of DLM service threads to start"); + +static char *ldlm_cpts; +CFS_MODULE_PARM(ldlm_cpts, "s", charp, 0444, + "CPU partitions ldlm threads should run on"); + +extern struct kmem_cache *ldlm_resource_slab; +extern struct kmem_cache *ldlm_lock_slab; +static struct mutex ldlm_ref_mutex; +static int ldlm_refcount; + +struct ldlm_cb_async_args { + struct ldlm_cb_set_arg *ca_set_arg; + struct ldlm_lock *ca_lock; +}; + +/* LDLM state */ + +static struct ldlm_state *ldlm_state; + +inline cfs_time_t round_timeout(cfs_time_t timeout) +{ + return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); +} + +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) +{ + /* Non-AT value */ + unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); + + return timeout < 1 ? 1 : timeout; +} + +#define ELT_STOPPED 0 +#define ELT_READY 1 +#define ELT_TERMINATE 2 + +struct ldlm_bl_pool { + spinlock_t blp_lock; + + /* + * blp_prio_list is used for callbacks that should be handled + * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. + * see bug 13843 + */ + struct list_head blp_prio_list; + + /* + * blp_list is used for all other callbacks which are likely + * to take longer to process. + */ + struct list_head blp_list; + + wait_queue_head_t blp_waitq; + struct completion blp_comp; + atomic_t blp_num_threads; + atomic_t blp_busy_threads; + int blp_min_threads; + int blp_max_threads; +}; + +struct ldlm_bl_work_item { + struct list_head blwi_entry; + struct ldlm_namespace *blwi_ns; + struct ldlm_lock_desc blwi_ld; + struct ldlm_lock *blwi_lock; + struct list_head blwi_head; + int blwi_count; + struct completion blwi_comp; + ldlm_cancel_flags_t blwi_flags; + int blwi_mem_pressure; +}; + + +int ldlm_del_waiting_lock(struct ldlm_lock *lock) +{ + RETURN(0); +} + +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout) +{ + RETURN(0); +} + + + +/** + * Callback handler for receiving incoming blocking ASTs. + * + * This can only happen on client side. + */ +void ldlm_handle_bl_callback(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, struct ldlm_lock *lock) +{ + int do_ast; + ENTRY; + + LDLM_DEBUG(lock, "client blocking AST callback handler"); + + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) + lock->l_flags |= LDLM_FL_CANCEL; + + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + CDEBUG(D_DLMTRACE, "Lock %p already unused, calling callback (%p)\n", + lock, lock->l_blocking_ast); + if (lock->l_blocking_ast != NULL) + lock->l_blocking_ast(lock, ld, lock->l_ast_data, + LDLM_CB_BLOCKING); + } else { + CDEBUG(D_DLMTRACE, "Lock %p is referenced, will be cancelled later\n", + lock); + } + + LDLM_DEBUG(lock, "client blocking callback handler END"); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +/** + * Callback handler for receiving incoming completion ASTs. + * + * This only can happen on client side. + */ +static void ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int lvb_len; + LIST_HEAD(ast_list); + int rc = 0; + ENTRY; + + LDLM_DEBUG(lock, "client completion callback handler START"); + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + int to = cfs_time_seconds(1); + while (to > 0) { + schedule_timeout_and_set_state( + TASK_INTERRUPTIBLE, to); + if (lock->l_granted_mode == lock->l_req_mode || + lock->l_destroyed) + break; + } + } + + lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); + if (lvb_len < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); + GOTO(out, rc = lvb_len); + } else if (lvb_len > 0) { + if (lock->l_lvb_len > 0) { + /* for extent lock, lvb contains ost_lvb{}. */ + LASSERT(lock->l_lvb_data != NULL); + + if (unlikely(lock->l_lvb_len < lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, " + "replied = %d", + lock->l_lvb_len, lvb_len); + GOTO(out, rc = -EINVAL); + } + } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has + * variable length */ + void *lvb_data; + + OBD_ALLOC(lvb_data, lvb_len); + if (lvb_data == NULL) { + LDLM_ERROR(lock, "No memory: %d.\n", lvb_len); + GOTO(out, rc = -ENOMEM); + } + + lock_res_and_lock(lock); + LASSERT(lock->l_lvb_data == NULL); + lock->l_lvb_data = lvb_data; + lock->l_lvb_len = lvb_len; + unlock_res_and_lock(lock); + } + } + + lock_res_and_lock(lock); + if (lock->l_destroyed || + lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + GOTO(out, rc = 0); + } + + /* If we receive the completion AST before the actual enqueue returned, + * then we might need to switch lock modes, resources, or extents. */ + if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { + lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; + LDLM_DEBUG(lock, "completion AST, new lock mode"); + } + + if (lock->l_resource->lr_type != LDLM_PLAIN) { + ldlm_convert_policy_to_local(req->rq_export, + dlm_req->lock_desc.l_resource.lr_type, + &dlm_req->lock_desc.l_policy_data, + &lock->l_policy_data); + LDLM_DEBUG(lock, "completion AST, new policy data"); + } + + ldlm_resource_unlink_lock(lock); + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { + unlock_res_and_lock(lock); + rc = ldlm_lock_change_resource(ns, lock, + &dlm_req->lock_desc.l_resource.lr_name); + if (rc < 0) { + LDLM_ERROR(lock, "Failed to allocate resource"); + GOTO(out, rc); + } + LDLM_DEBUG(lock, "completion AST, new resource"); + CERROR("change resource!\n"); + lock_res_and_lock(lock); + } + + if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + LDLM_DEBUG(lock, "completion AST includes blocking AST"); + } + + if (lock->l_lvb_len > 0) { + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, + lock->l_lvb_data, lvb_len); + if (rc < 0) { + unlock_res_and_lock(lock); + GOTO(out, rc); + } + } + + ldlm_grant_lock(lock, &ast_list); + unlock_res_and_lock(lock); + + LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + + /* Let Enqueue to call osc_lock_upcall() and initialize + * l_ast_data */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); + + ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); + + LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", + lock); + GOTO(out, rc); + +out: + if (rc < 0) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_FAILED; + unlock_res_and_lock(lock); + wake_up(&lock->l_waitq); + } + LDLM_LOCK_RELEASE(lock); +} + +/** + * Callback handler for receiving incoming glimpse ASTs. + * + * This only can happen on client side. After handling the glimpse AST + * we also consider dropping the lock here if it is unused locally for a + * long time. + */ +static void ldlm_handle_gl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns, + struct ldlm_request *dlm_req, + struct ldlm_lock *lock) +{ + int rc = -ENOSYS; + ENTRY; + + LDLM_DEBUG(lock, "client glimpse AST callback handler"); + + if (lock->l_glimpse_ast != NULL) + rc = lock->l_glimpse_ast(lock, req); + + if (req->rq_repmsg != NULL) { + ptlrpc_reply(req); + } else { + req->rq_status = rc; + ptlrpc_error(req); + } + + lock_res_and_lock(lock); + if (lock->l_granted_mode == LCK_PW && + !lock->l_readers && !lock->l_writers && + cfs_time_after(cfs_time_current(), + cfs_time_add(lock->l_last_used, + cfs_time_seconds(10)))) { + unlock_res_and_lock(lock); + if (ldlm_bl_to_thread_lock(ns, NULL, lock)) + ldlm_handle_bl_callback(ns, NULL, lock); + + EXIT; + return; + } + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + EXIT; +} + +static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) +{ + if (req->rq_no_reply) + return 0; + + req->rq_status = rc; + if (!req->rq_packed_final) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + return rc; + } + return ptlrpc_reply(req); +} + +static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, + ldlm_cancel_flags_t cancel_flags) +{ + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + ENTRY; + + spin_lock(&blp->blp_lock); + if (blwi->blwi_lock && + blwi->blwi_lock->l_flags & LDLM_FL_DISCARD_DATA) { + /* add LDLM_FL_DISCARD_DATA requests to the priority list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); + } else { + /* other blocking callbacks are added to the regular list */ + list_add_tail(&blwi->blwi_entry, &blp->blp_list); + } + spin_unlock(&blp->blp_lock); + + wake_up(&blp->blp_waitq); + + /* can not check blwi->blwi_flags as blwi could be already freed in + LCF_ASYNC mode */ + if (!(cancel_flags & LCF_ASYNC)) + wait_for_completion(&blwi->blwi_comp); + + RETURN(0); +} + +static inline void init_blwi(struct ldlm_bl_work_item *blwi, + struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + struct ldlm_lock *lock, + ldlm_cancel_flags_t cancel_flags) +{ + init_completion(&blwi->blwi_comp); + INIT_LIST_HEAD(&blwi->blwi_head); + + if (memory_pressure_get()) + blwi->blwi_mem_pressure = 1; + + blwi->blwi_ns = ns; + blwi->blwi_flags = cancel_flags; + if (ld != NULL) + blwi->blwi_ld = *ld; + if (count) { + list_add(&blwi->blwi_head, cancels); + list_del_init(cancels); + blwi->blwi_count = count; + } else { + blwi->blwi_lock = lock; + } +} + +/** + * Queues a list of locks \a cancels containing \a count locks + * for later processing by a blocking thread. If \a count is zero, + * then the lock referenced as \a lock is queued instead. + * + * The blocking thread would then call ->l_blocking_ast callback in the lock. + * If list addition fails an error is returned and caller is supposed to + * call ->l_blocking_ast itself. + */ +static int ldlm_bl_to_thread(struct ldlm_namespace *ns, + struct ldlm_lock_desc *ld, + struct ldlm_lock *lock, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + ENTRY; + + if (cancels && count == 0) + RETURN(0); + + if (cancel_flags & LCF_ASYNC) { + struct ldlm_bl_work_item *blwi; + + OBD_ALLOC(blwi, sizeof(*blwi)); + if (blwi == NULL) + RETURN(-ENOMEM); + init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); + + RETURN(__ldlm_bl_to_thread(blwi, cancel_flags)); + } else { + /* if it is synchronous call do minimum mem alloc, as it could + * be triggered from kernel shrinker + */ + struct ldlm_bl_work_item blwi; + + memset(&blwi, 0, sizeof(blwi)); + init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); + RETURN(__ldlm_bl_to_thread(&blwi, cancel_flags)); + } +} + + +int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct ldlm_lock *lock) +{ + return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); +} + +int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, + struct list_head *cancels, int count, + ldlm_cancel_flags_t cancel_flags) +{ + return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); +} + +/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ +static int ldlm_handle_setinfo(struct ptlrpc_request *req) +{ + struct obd_device *obd = req->rq_export->exp_obd; + char *key; + void *val; + int keylen, vallen; + int rc = -ENOSYS; + ENTRY; + + DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); + + req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); + + key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + if (key == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info key"); + RETURN(-EFAULT); + } + keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT); + val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + if (val == NULL) { + DEBUG_REQ(D_IOCTL, req, "no set_info val"); + RETURN(-EFAULT); + } + vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT); + + /* We are responsible for swabbing contents of val */ + + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) + /* Pass it on to mdc (the "export" in this case) */ + rc = obd_set_info_async(req->rq_svc_thread->t_env, + req->rq_export, + sizeof(KEY_HSM_COPYTOOL_SEND), + KEY_HSM_COPYTOOL_SEND, + vallen, val, NULL); + else + DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); + + return rc; +} + +static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, + const char *msg, int rc, + struct lustre_handle *handle) +{ + DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, + "%s: [nid %s] [rc %d] [lock "LPX64"]", + msg, libcfs_id2str(req->rq_peer), rc, + handle ? handle->cookie : 0); + if (req->rq_no_reply) + CWARN("No reply was sent, maybe cause bug 21636.\n"); + else if (rc) + CWARN("Send reply failed, maybe cause bug 21636.\n"); +} + +static int ldlm_handle_qc_callback(struct ptlrpc_request *req) +{ + struct obd_quotactl *oqctl; + struct client_obd *cli = &req->rq_export->exp_obd->u.cli; + + oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + if (oqctl == NULL) { + CERROR("Can't unpack obd_quotactl\n"); + RETURN(-EPROTO); + } + + cli->cl_qchk_stat = oqctl->qc_stat; + return 0; +} + +/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ +static int ldlm_callback_handler(struct ptlrpc_request *req) +{ + struct ldlm_namespace *ns; + struct ldlm_request *dlm_req; + struct ldlm_lock *lock; + int rc; + ENTRY; + + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ + + /* do nothing for sec context finalize */ + if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) + RETURN(0); + + req_capsule_init(&req->rq_pill, req, RCL_SERVER); + + if (req->rq_export == NULL) { + rc = ldlm_callback_reply(req, -ENOTCONN); + ldlm_callback_errmsg(req, "Operate on unconnected server", + rc, NULL); + RETURN(0); + } + + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_CP_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_GL_CALLBACK: + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) + RETURN(0); + break; + case LDLM_SET_INFO: + rc = ldlm_handle_setinfo(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case OBD_LOG_CANCEL: /* remove this eventually - for 1.4.0 compat */ + CERROR("shouldn't be handling OBD_LOG_CANCEL on DLM thread\n"); + req_capsule_set(&req->rq_pill, &RQF_LOG_CANCEL); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_NET)) + RETURN(0); + rc = llog_origin_handle_cancel(req); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOG_CANCEL_REP)) + RETURN(0); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_CREATE: + req_capsule_set(&req->rq_pill, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_open(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_NEXT_BLOCK: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_next_block(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_READ_HEADER: + req_capsule_set(&req->rq_pill, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_read_header(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case LLOG_ORIGIN_HANDLE_CLOSE: + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LOGD_NET)) + RETURN(0); + rc = llog_origin_handle_close(req); + ldlm_callback_reply(req, rc); + RETURN(0); + case OBD_QC_CALLBACK: + req_capsule_set(&req->rq_pill, &RQF_QC_CALLBACK); + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_QC_CALLBACK_NET)) + RETURN(0); + rc = ldlm_handle_qc_callback(req); + ldlm_callback_reply(req, rc); + RETURN(0); + default: + CERROR("unknown opcode %u\n", + lustre_msg_get_opc(req->rq_reqmsg)); + ldlm_callback_reply(req, -EPROTO); + RETURN(0); + } + + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + + req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); + + dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + if (dlm_req == NULL) { + rc = ldlm_callback_reply(req, -EPROTO); + ldlm_callback_errmsg(req, "Operate without parameter", rc, + NULL); + RETURN(0); + } + + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. */ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + + lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); + if (!lock) { + CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock " + "disappeared\n", dlm_req->lock_handle[0].cookie); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + + if ((lock->l_flags & LDLM_FL_FAIL_LOC) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) + OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); + + /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ + lock_res_and_lock(lock); + lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & + LDLM_AST_FLAGS); + if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + /* If somebody cancels lock and cache is already dropped, + * or lock is failed before cp_ast received on client, + * we can tell the server we have no lock. Otherwise, we + * should send cancel after dropping the cache. */ + if (((lock->l_flags & LDLM_FL_CANCELING) && + (lock->l_flags & LDLM_FL_BL_DONE)) || + (lock->l_flags & LDLM_FL_FAILED)) { + LDLM_DEBUG(lock, "callback on lock " + LPX64" - lock disappeared\n", + dlm_req->lock_handle[0].cookie); + unlock_res_and_lock(lock); + LDLM_LOCK_RELEASE(lock); + rc = ldlm_callback_reply(req, -EINVAL); + ldlm_callback_errmsg(req, "Operate on stale lock", rc, + &dlm_req->lock_handle[0]); + RETURN(0); + } + /* BL_AST locks are not needed in LRU. + * Let ldlm_cancel_lru() be fast. */ + ldlm_lock_remove_from_lru(lock); + lock->l_flags |= LDLM_FL_BL_AST; + } + unlock_res_and_lock(lock); + + /* We want the ost thread to get this reply so that it can respond + * to ost requests (write cache writeback) that might be triggered + * in the callback. + * + * But we'd also like to be able to indicate in the reply that we're + * cancelling right now, because it's unused, or have an intent result + * in the reply, so we might have to push the responsibility for sending + * the reply down into the AST handlers, alas. */ + + switch (lustre_msg_get_opc(req->rq_reqmsg)) { + case LDLM_BL_CALLBACK: + CDEBUG(D_INODE, "blocking ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); + if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)) { + rc = ldlm_callback_reply(req, 0); + if (req->rq_no_reply || rc) + ldlm_callback_errmsg(req, "Normal process", rc, + &dlm_req->lock_handle[0]); + } + if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) + ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); + break; + case LDLM_CP_CALLBACK: + CDEBUG(D_INODE, "completion ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); + ldlm_callback_reply(req, 0); + ldlm_handle_cp_callback(req, ns, dlm_req, lock); + break; + case LDLM_GL_CALLBACK: + CDEBUG(D_INODE, "glimpse ast\n"); + req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); + ldlm_handle_gl_callback(req, ns, dlm_req, lock); + break; + default: + LBUG(); /* checked above */ + } + + RETURN(0); +} + + +static struct ldlm_bl_work_item *ldlm_bl_get_work(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_work_item *blwi = NULL; + static unsigned int num_bl = 0; + + spin_lock(&blp->blp_lock); + /* process a request from the blp_list at least every blp_num_threads */ + if (!list_empty(&blp->blp_list) && + (list_empty(&blp->blp_prio_list) || num_bl == 0)) + blwi = list_entry(blp->blp_list.next, + struct ldlm_bl_work_item, blwi_entry); + else + if (!list_empty(&blp->blp_prio_list)) + blwi = list_entry(blp->blp_prio_list.next, + struct ldlm_bl_work_item, + blwi_entry); + + if (blwi) { + if (++num_bl >= atomic_read(&blp->blp_num_threads)) + num_bl = 0; + list_del(&blwi->blwi_entry); + } + spin_unlock(&blp->blp_lock); + + return blwi; +} + +/* This only contains temporary data until the thread starts */ +struct ldlm_bl_thread_data { + char bltd_name[CFS_CURPROC_COMM_MAX]; + struct ldlm_bl_pool *bltd_blp; + struct completion bltd_comp; + int bltd_num; +}; + +static int ldlm_bl_thread_main(void *arg); + +static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp) +{ + struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; + task_t *task; + + init_completion(&bltd.bltd_comp); + bltd.bltd_num = atomic_read(&blp->blp_num_threads); + snprintf(bltd.bltd_name, sizeof(bltd.bltd_name) - 1, + "ldlm_bl_%02d", bltd.bltd_num); + task = kthread_run(ldlm_bl_thread_main, &bltd, bltd.bltd_name); + if (IS_ERR(task)) { + CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", + atomic_read(&blp->blp_num_threads), PTR_ERR(task)); + return PTR_ERR(task); + } + wait_for_completion(&bltd.bltd_comp); + + return 0; +} + +/** + * Main blocking requests processing thread. + * + * Callers put locks into its queue by calling ldlm_bl_to_thread. + * This thread in the end ends up doing actual call to ->l_blocking_ast + * for queued locks. + */ +static int ldlm_bl_thread_main(void *arg) +{ + struct ldlm_bl_pool *blp; + ENTRY; + + { + struct ldlm_bl_thread_data *bltd = arg; + + blp = bltd->bltd_blp; + + atomic_inc(&blp->blp_num_threads); + atomic_inc(&blp->blp_busy_threads); + + complete(&bltd->bltd_comp); + /* cannot use bltd after this, it is only on caller's stack */ + } + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ldlm_bl_work_item *blwi = NULL; + int busy; + + blwi = ldlm_bl_get_work(blp); + + if (blwi == NULL) { + atomic_dec(&blp->blp_busy_threads); + l_wait_event_exclusive(blp->blp_waitq, + (blwi = ldlm_bl_get_work(blp)) != NULL, + &lwi); + busy = atomic_inc_return(&blp->blp_busy_threads); + } else { + busy = atomic_read(&blp->blp_busy_threads); + } + + if (blwi->blwi_ns == NULL) + /* added by ldlm_cleanup() */ + break; + + /* Not fatal if racy and have a few too many threads */ + if (unlikely(busy < blp->blp_max_threads && + busy >= atomic_read(&blp->blp_num_threads) && + !blwi->blwi_mem_pressure)) + /* discard the return value, we tried */ + ldlm_bl_thread_start(blp); + + if (blwi->blwi_mem_pressure) + memory_pressure_set(); + + if (blwi->blwi_count) { + int count; + /* The special case when we cancel locks in LRU + * asynchronously, we pass the list of locks here. + * Thus locks are marked LDLM_FL_CANCELING, but NOT + * canceled locally yet. */ + count = ldlm_cli_cancel_list_local(&blwi->blwi_head, + blwi->blwi_count, + LCF_BL_AST); + ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, + blwi->blwi_flags); + } else { + ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, + blwi->blwi_lock); + } + if (blwi->blwi_mem_pressure) + memory_pressure_clr(); + + if (blwi->blwi_flags & LCF_ASYNC) + OBD_FREE(blwi, sizeof(*blwi)); + else + complete(&blwi->blwi_comp); + } + + atomic_dec(&blp->blp_busy_threads); + atomic_dec(&blp->blp_num_threads); + complete(&blp->blp_comp); + RETURN(0); +} + + +static int ldlm_setup(void); +static int ldlm_cleanup(void); + +int ldlm_get_ref(void) +{ + int rc = 0; + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (++ldlm_refcount == 1) { + rc = ldlm_setup(); + if (rc) + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_get_ref); + +void ldlm_put_ref(void) +{ + ENTRY; + mutex_lock(&ldlm_ref_mutex); + if (ldlm_refcount == 1) { + int rc = ldlm_cleanup(); + if (rc) + CERROR("ldlm_cleanup failed: %d\n", rc); + else + ldlm_refcount--; + } else { + ldlm_refcount--; + } + mutex_unlock(&ldlm_ref_mutex); + + EXIT; +} +EXPORT_SYMBOL(ldlm_put_ref); + +/* + * Export handle<->lock hash operations. + */ +static unsigned +ldlm_export_lock_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_u64_hash(((struct lustre_handle *)key)->cookie, mask); +} + +static void * +ldlm_export_lock_key(struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + return &lock->l_remote_handle; +} + +static void +ldlm_export_lock_keycpy(struct hlist_node *hnode, void *key) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + lock->l_remote_handle = *(struct lustre_handle *)key; +} + +static int +ldlm_export_lock_keycmp(const void *key, struct hlist_node *hnode) +{ + return lustre_handle_equal(ldlm_export_lock_key(hnode), key); +} + +static void * +ldlm_export_lock_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_lock, l_exp_hash); +} + +static void +ldlm_export_lock_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_GET(lock); +} + +static void +ldlm_export_lock_put(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_lock *lock; + + lock = hlist_entry(hnode, struct ldlm_lock, l_exp_hash); + LDLM_LOCK_RELEASE(lock); +} + +static cfs_hash_ops_t ldlm_export_lock_ops = { + .hs_hash = ldlm_export_lock_hash, + .hs_key = ldlm_export_lock_key, + .hs_keycmp = ldlm_export_lock_keycmp, + .hs_keycpy = ldlm_export_lock_keycpy, + .hs_object = ldlm_export_lock_object, + .hs_get = ldlm_export_lock_get, + .hs_put = ldlm_export_lock_put, + .hs_put_locked = ldlm_export_lock_put, +}; + +int ldlm_init_export(struct obd_export *exp) +{ + ENTRY; + + exp->exp_lock_hash = + cfs_hash_create(obd_uuid2str(&exp->exp_client_uuid), + HASH_EXP_LOCK_CUR_BITS, + HASH_EXP_LOCK_MAX_BITS, + HASH_EXP_LOCK_BKT_BITS, 0, + CFS_HASH_MIN_THETA, CFS_HASH_MAX_THETA, + &ldlm_export_lock_ops, + CFS_HASH_DEFAULT | CFS_HASH_REHASH_KEY | + CFS_HASH_NBLK_CHANGE); + + if (!exp->exp_lock_hash) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(ldlm_init_export); + +void ldlm_destroy_export(struct obd_export *exp) +{ + ENTRY; + cfs_hash_putref(exp->exp_lock_hash); + exp->exp_lock_hash = NULL; + + ldlm_destroy_flock_export(exp); + EXIT; +} +EXPORT_SYMBOL(ldlm_destroy_export); + +static int ldlm_setup(void) +{ + static struct ptlrpc_service_conf conf; + struct ldlm_bl_pool *blp = NULL; + int rc = 0; + int i; + ENTRY; + + if (ldlm_state != NULL) + RETURN(-EALREADY); + + OBD_ALLOC(ldlm_state, sizeof(*ldlm_state)); + if (ldlm_state == NULL) + RETURN(-ENOMEM); + +#ifdef LPROCFS + rc = ldlm_proc_setup(); + if (rc != 0) + GOTO(out, rc); +#endif + + memset(&conf, 0, sizeof(conf)); + conf = (typeof(conf)) { + .psc_name = "ldlm_cbd", + .psc_watchdog_factor = 2, + .psc_buf = { + .bc_nbufs = LDLM_CLIENT_NBUFS, + .bc_buf_size = LDLM_BUFSIZE, + .bc_req_max_size = LDLM_MAXREQSIZE, + .bc_rep_max_size = LDLM_MAXREPSIZE, + .bc_req_portal = LDLM_CB_REQUEST_PORTAL, + .bc_rep_portal = LDLM_CB_REPLY_PORTAL, + }, + .psc_thr = { + .tc_thr_name = "ldlm_cb", + .tc_thr_factor = LDLM_THR_FACTOR, + .tc_nthrs_init = LDLM_NTHRS_INIT, + .tc_nthrs_base = LDLM_NTHRS_BASE, + .tc_nthrs_max = LDLM_NTHRS_MAX, + .tc_nthrs_user = ldlm_num_threads, + .tc_cpu_affinity = 1, + .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, + }, + .psc_cpt = { + .cc_pattern = ldlm_cpts, + }, + .psc_ops = { + .so_req_handler = ldlm_callback_handler, + }, + }; + ldlm_state->ldlm_cb_service = \ + ptlrpc_register_service(&conf, ldlm_svc_proc_dir); + if (IS_ERR(ldlm_state->ldlm_cb_service)) { + CERROR("failed to start service\n"); + rc = PTR_ERR(ldlm_state->ldlm_cb_service); + ldlm_state->ldlm_cb_service = NULL; + GOTO(out, rc); + } + + + OBD_ALLOC(blp, sizeof(*blp)); + if (blp == NULL) + GOTO(out, rc = -ENOMEM); + ldlm_state->ldlm_bl_pool = blp; + + spin_lock_init(&blp->blp_lock); + INIT_LIST_HEAD(&blp->blp_list); + INIT_LIST_HEAD(&blp->blp_prio_list); + init_waitqueue_head(&blp->blp_waitq); + atomic_set(&blp->blp_num_threads, 0); + atomic_set(&blp->blp_busy_threads, 0); + + if (ldlm_num_threads == 0) { + blp->blp_min_threads = LDLM_NTHRS_INIT; + blp->blp_max_threads = LDLM_NTHRS_MAX; + } else { + blp->blp_min_threads = blp->blp_max_threads = \ + min_t(int, LDLM_NTHRS_MAX, max_t(int, LDLM_NTHRS_INIT, + ldlm_num_threads)); + } + + for (i = 0; i < blp->blp_min_threads; i++) { + rc = ldlm_bl_thread_start(blp); + if (rc < 0) + GOTO(out, rc); + } + + + rc = ldlm_pools_init(); + if (rc) { + CERROR("Failed to initialize LDLM pools: %d\n", rc); + GOTO(out, rc); + } + RETURN(0); + + out: + ldlm_cleanup(); + RETURN(rc); +} + +static int ldlm_cleanup(void) +{ + ENTRY; + + if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || + !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { + CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(-EBUSY); + } + + ldlm_pools_fini(); + + if (ldlm_state->ldlm_bl_pool != NULL) { + struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; + + while (atomic_read(&blp->blp_num_threads) > 0) { + struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; + + init_completion(&blp->blp_comp); + + spin_lock(&blp->blp_lock); + list_add_tail(&blwi.blwi_entry, &blp->blp_list); + wake_up(&blp->blp_waitq); + spin_unlock(&blp->blp_lock); + + wait_for_completion(&blp->blp_comp); + } + + OBD_FREE(blp, sizeof(*blp)); + } + + if (ldlm_state->ldlm_cb_service != NULL) + ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); + + ldlm_proc_cleanup(); + + + OBD_FREE(ldlm_state, sizeof(*ldlm_state)); + ldlm_state = NULL; + + RETURN(0); +} + +int ldlm_init(void) +{ + mutex_init(&ldlm_ref_mutex); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); + mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); + ldlm_resource_slab = kmem_cache_create("ldlm_resources", + sizeof(struct ldlm_resource), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_resource_slab == NULL) + return -ENOMEM; + + ldlm_lock_slab = kmem_cache_create("ldlm_locks", + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL); + if (ldlm_lock_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + return -ENOMEM; + } + + ldlm_interval_slab = kmem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ldlm_interval_slab == NULL) { + kmem_cache_destroy(ldlm_resource_slab); + kmem_cache_destroy(ldlm_lock_slab); + return -ENOMEM; + } +#if LUSTRE_TRACKS_LOCK_EXP_REFS + class_export_dump_hook = ldlm_dump_export_locks; +#endif + return 0; +} + +void ldlm_exit(void) +{ + if (ldlm_refcount) + CERROR("ldlm_refcount is %d in ldlm_exit!\n", ldlm_refcount); + kmem_cache_destroy(ldlm_resource_slab); + /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call + * synchronize_rcu() to wait a grace period elapsed, so that + * ldlm_lock_free() get a chance to be called. */ + synchronize_rcu(); + kmem_cache_destroy(ldlm_lock_slab); + kmem_cache_destroy(ldlm_interval_slab); +} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c new file mode 100644 index 000000000000..ec29e28624fe --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_plain.c + * + * Author: Peter Braam + * Author: Phil Schwan + */ + +/** + * This file contains implementation of PLAIN lock type. + * + * PLAIN locks are the simplest form of LDLM locking, and are used when + * there only needs to be a single lock on a resource. This avoids some + * of the complexity of EXTENT and IBITS lock types, but doesn't allow + * different "parts" of a resource to be locked concurrently. Example + * use cases for PLAIN locks include locking of MGS configuration logs + * and (as of Lustre 2.4) quota records. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + + +void ldlm_plain_policy_wire_to_local(const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy) +{ + /* No policy for plain locks */ +} + +void ldlm_plain_policy_local_to_wire(const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy) +{ + /* No policy for plain locks */ +} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c new file mode 100644 index 000000000000..0604295a493d --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c @@ -0,0 +1,1406 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_pool.c + * + * Author: Yury Umanets + */ + +/* + * Idea of this code is rather simple. Each second, for each server namespace + * we have SLV - server lock volume which is calculated on current number of + * granted locks, grant speed for past period, etc - that is, locking load. + * This SLV number may be thought as a flow definition for simplicity. It is + * sent to clients with each occasion to let them know what is current load + * situation on the server. By default, at the beginning, SLV on server is + * set max value which is calculated as the following: allow to one client + * have all locks of limit ->pl_limit for 10h. + * + * Next, on clients, number of cached locks is not limited artificially in any + * way as it was before. Instead, client calculates CLV, that is, client lock + * volume for each lock and compares it with last SLV from the server. CLV is + * calculated as the number of locks in LRU * lock live time in seconds. If + * CLV > SLV - lock is canceled. + * + * Client has LVF, that is, lock volume factor which regulates how much sensitive + * client should be about last SLV from server. The higher LVF is the more locks + * will be canceled on client. Default value for it is 1. Setting LVF to 2 means + * that client will cancel locks 2 times faster. + * + * Locks on a client will be canceled more intensively in these cases: + * (1) if SLV is smaller, that is, load is higher on the server; + * (2) client has a lot of locks (the more locks are held by client, the bigger + * chances that some of them should be canceled); + * (3) client has old locks (taken some time ago); + * + * Thus, according to flow paradigm that we use for better understanding SLV, + * CLV is the volume of particle in flow described by SLV. According to this, + * if flow is getting thinner, more and more particles become outside of it and + * as particles are locks, they should be canceled. + * + * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). Andreas + * Dilger (adilger@clusterfs.com) proposed few nice ideas like using LVF and many + * cleanups. Flow definition to allow more easy understanding of the logic belongs + * to Nikita Danilov (nikita@clusterfs.com) as well as many cleanups and fixes. + * And design and implementation are done by Yury Umanets (umka@clusterfs.com). + * + * Glossary for terms used: + * + * pl_limit - Number of allowed locks in pool. Applies to server and client + * side (tunable); + * + * pl_granted - Number of granted locks (calculated); + * pl_grant_rate - Number of granted locks for last T (calculated); + * pl_cancel_rate - Number of canceled locks for last T (calculated); + * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); + * pl_grant_plan - Planned number of granted locks for next T (calculated); + * pl_server_lock_volume - Current server lock volume (calculated); + * + * As it may be seen from list above, we have few possible tunables which may + * affect behavior much. They all may be modified via proc. However, they also + * give a possibility for constructing few pre-defined behavior policies. If + * none of predefines is suitable for a working pattern being used, new one may + * be "constructed" via proc tunables. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +# include + +#include + +#include +#include +#include "ldlm_internal.h" + + +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50) + +/* + * Maximal possible grant step plan in %. + */ +#define LDLM_POOL_MAX_GSP (30) + +/* + * Minimal possible grant step plan in %. + */ +#define LDLM_POOL_MIN_GSP (1) + +/* + * This controls the speed of reaching LDLM_POOL_MAX_GSP + * with increasing thread period. + */ +#define LDLM_POOL_GSP_STEP_SHIFT (2) + +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ +#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) + +/* + * Max age for locks on clients. + */ +#define LDLM_POOL_MAX_AGE (36000) + +/* + * The granularity of SLV calculation. + */ +#define LDLM_POOL_SLV_SHIFT (10) + +extern proc_dir_entry_t *ldlm_ns_proc_dir; + +static inline __u64 dru(__u64 val, __u32 shift, int round_up) +{ + return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; +} + +static inline __u64 ldlm_pool_slv_max(__u32 L) +{ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ + __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; + return lim; +} + +static inline __u64 ldlm_pool_slv_min(__u32 L) +{ + return 1; +} + +enum { + LDLM_POOL_FIRST_STAT = 0, + LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, + LDLM_POOL_GRANT_STAT, + LDLM_POOL_CANCEL_STAT, + LDLM_POOL_GRANT_RATE_STAT, + LDLM_POOL_CANCEL_RATE_STAT, + LDLM_POOL_GRANT_PLAN_STAT, + LDLM_POOL_SLV_STAT, + LDLM_POOL_SHRINK_REQTD_STAT, + LDLM_POOL_SHRINK_FREED_STAT, + LDLM_POOL_RECALC_STAT, + LDLM_POOL_TIMING_STAT, + LDLM_POOL_LAST_STAT +}; + +static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) +{ + return container_of(pl, struct ldlm_namespace, ns_pool); +} + +/** + * Calculates suggested grant_step in % of available locks for passed + * \a period. This is later used in grant_plan calculations. + */ +static inline int ldlm_pool_t2gsp(unsigned int t) +{ + /* + * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP + * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. + * + * How this will affect execution is the following: + * + * - for thread period 1s we will have grant_step 1% which good from + * pov of taking some load off from server and push it out to clients. + * This is like that because 1% for grant_step means that server will + * not allow clients to get lots of locks in short period of time and + * keep all old locks in their caches. Clients will always have to + * get some locks back if they want to take some new; + * + * - for thread period 10s (which is default) we will have 23% which + * means that clients will have enough of room to take some new locks + * without getting some back. All locks from this 23% which were not + * taken by clients in current period will contribute in SLV growing. + * SLV growing means more locks cached on clients until limit or grant + * plan is reached. + */ + return LDLM_POOL_MAX_GSP - + ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> + (t >> LDLM_POOL_GSP_STEP_SHIFT)); +} + +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) +{ + int granted, grant_step, limit; + + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + grant_step = ((limit - granted) * grant_step) / 100; + pl->pl_grant_plan = granted + grant_step; + limit = (limit * 5) >> 2; + if (pl->pl_grant_plan > limit) + pl->pl_grant_plan = limit; +} + +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_slv(struct ldlm_pool *pl) +{ + int granted; + int grant_plan; + int round_up; + __u64 slv; + __u64 slv_factor; + __u64 grant_usage; + __u32 limit; + + slv = pl->pl_server_lock_volume; + grant_plan = pl->pl_grant_plan; + limit = ldlm_pool_get_limit(pl); + granted = atomic_read(&pl->pl_granted); + round_up = granted < limit; + + grant_usage = max_t(int, limit - (granted - grant_plan), 1); + + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows + * SLV. And the opposite, the more grant plan is over-consumed + * (load time) the faster drops SLV. + */ + slv_factor = (grant_usage << LDLM_POOL_SLV_SHIFT); + do_div(slv_factor, limit); + slv = slv * slv_factor; + slv = dru(slv, LDLM_POOL_SLV_SHIFT, round_up); + + if (slv > ldlm_pool_slv_max(limit)) { + slv = ldlm_pool_slv_max(limit); + } else if (slv < ldlm_pool_slv_min(limit)) { + slv = ldlm_pool_slv_min(limit); + } + + pl->pl_server_lock_volume = slv; +} + +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ +static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) +{ + int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; + int granted = atomic_read(&pl->pl_granted); + int grant_rate = atomic_read(&pl->pl_grant_rate); + int cancel_rate = atomic_read(&pl->pl_cancel_rate); + + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + slv); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + granted); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + grant_rate); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + grant_plan); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + cancel_rate); +} + +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ +static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + ENTRY; + + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ + ldlm_pool_recalc_slv(pl); + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ + ldlm_pool_recalc_grant_plan(pl); + + pl->pl_recalc_time = cfs_time_current_sec(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + RETURN(0); +} + +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. + * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ +static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, + int nr, unsigned int gfp_mask) +{ + __u32 limit; + + /* + * VM is asking how many entries may be potentially freed. + */ + if (nr == 0) + return atomic_read(&pl->pl_granted); + + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ + if (atomic_read(&pl->pl_granted) == 0) + RETURN(0); + + spin_lock(&pl->pl_lock); + + /* + * We want shrinker to possibly cause cancellation of @nr locks from + * clients or grant approximately @nr locks smaller next intervals. + * + * This is why we decreased SLV by @nr. This effect will only be as + * long as one re-calc interval (1s these days) and this should be + * enough to pass this decreased SLV to all clients. On next recalc + * interval pool will either increase SLV if locks load is not high + * or will keep on same level or even decrease again, thus, shrinker + * decreased SLV will affect next recalc intervals and this way will + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; + } else { + limit = ldlm_pool_get_limit(pl); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); + } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + spin_unlock(&pl->pl_lock); + + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ + return 0; +} + +/** + * Setup server side pool \a pl with passed \a limit. + */ +static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) +{ + struct obd_device *obd; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + + ldlm_pool_set_limit(pl, limit); + return 0; +} + +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd tp passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client size pool \a pl according to current SLV and Limit. + */ +static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + ENTRY; + + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) + RETURN(0); + + spin_lock(&pl->pl_lock); + /* + * Check if we need to recalc lists now. + */ + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec < pl->pl_recalc_period) { + spin_unlock(&pl->pl_lock); + RETURN(0); + } + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + pl->pl_recalc_time = cfs_time_current_sec(); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + recalc_interval_sec); + spin_unlock(&pl->pl_lock); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + RETURN(0); + + /* + * In the time of canceling locks on client we do not need to maintain + * sharp timing, we only want to cancel locks asap according to new SLV. + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LCF_ASYNC, + LDLM_CANCEL_LRUR)); +} + +/** + * This function is main entry point for memory pressure handling on client + * side. Main goal of this function is to cancel some number of locks on + * passed \a pl according to \a nr and \a gfp_mask. + */ +static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, + int nr, unsigned int gfp_mask) +{ + struct ldlm_namespace *ns; + int canceled = 0, unused; + + ns = ldlm_pl2ns(pl); + + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ + if (!ns_connect_lru_resize(ns)) + RETURN(0); + + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_lock); + + if (nr) { + canceled = ldlm_cancel_lru(ns, nr, LCF_ASYNC, + LDLM_CANCEL_SHRINK); + } + /* + * Return the number of potentially reclaimable locks. + */ + return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure; +} + +struct ldlm_pool_ops ldlm_srv_pool_ops = { + .po_recalc = ldlm_srv_pool_recalc, + .po_shrink = ldlm_srv_pool_shrink, + .po_setup = ldlm_srv_pool_setup +}; + +struct ldlm_pool_ops ldlm_cli_pool_ops = { + .po_recalc = ldlm_cli_pool_recalc, + .po_shrink = ldlm_cli_pool_shrink +}; + +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_recalc(struct ldlm_pool *pl) +{ + time_t recalc_interval_sec; + int count; + + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec <= 0) + goto recalc; + + spin_lock(&pl->pl_lock); + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; + if (recalc_interval_sec > 0) { + /* + * Update pool statistics every 1s. + */ + ldlm_pool_recalc_stats(pl); + + /* + * Zero out all rates and speed for the last period. + */ + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + } + spin_unlock(&pl->pl_lock); + + recalc: + if (pl->pl_ops->po_recalc != NULL) { + count = pl->pl_ops->po_recalc(pl); + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + count); + return count; + } + + return 0; +} +EXPORT_SYMBOL(ldlm_pool_recalc); + +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask) +{ + int cancel = 0; + + if (pl->pl_ops->po_shrink != NULL) { + cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); + if (nr > 0) { + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_REQTD_STAT, + nr); + lprocfs_counter_add(pl->pl_stats, + LDLM_POOL_SHRINK_FREED_STAT, + cancel); + CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, " + "shrunk %d\n", pl->pl_name, nr, cancel); + } + } + return cancel; +} +EXPORT_SYMBOL(ldlm_pool_shrink); + +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. + */ +int ldlm_pool_setup(struct ldlm_pool *pl, int limit) +{ + if (pl->pl_ops->po_setup != NULL) + return(pl->pl_ops->po_setup(pl, limit)); + return 0; +} +EXPORT_SYMBOL(ldlm_pool_setup); + +static int lprocfs_rd_pool_state(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int granted, grant_rate, cancel_rate, grant_step; + int nr = 0, grant_speed, grant_plan, lvf; + struct ldlm_pool *pl = data; + __u64 slv, clv; + __u32 limit; + + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; + limit = ldlm_pool_get_limit(pl); + grant_plan = pl->pl_grant_plan; + granted = atomic_read(&pl->pl_granted); + grant_rate = atomic_read(&pl->pl_grant_rate); + cancel_rate = atomic_read(&pl->pl_cancel_rate); + grant_speed = grant_rate - cancel_rate; + lvf = atomic_read(&pl->pl_lock_volume_factor); + grant_step = ldlm_pool_t2gsp(pl->pl_recalc_period); + spin_unlock(&pl->pl_lock); + + nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n", + pl->pl_name); + nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv); + nr += snprintf(page + nr, count - nr, " CLV: "LPU64"\n", clv); + nr += snprintf(page + nr, count - nr, " LVF: %d\n", lvf); + + if (ns_is_server(ldlm_pl2ns(pl))) { + nr += snprintf(page + nr, count - nr, " GSP: %d%%\n", + grant_step); + nr += snprintf(page + nr, count - nr, " GP: %d\n", + grant_plan); + } + nr += snprintf(page + nr, count - nr, " GR: %d\n", + grant_rate); + nr += snprintf(page + nr, count - nr, " CR: %d\n", + cancel_rate); + nr += snprintf(page + nr, count - nr, " GS: %d\n", + grant_speed); + nr += snprintf(page + nr, count - nr, " G: %d\n", + granted); + nr += snprintf(page + nr, count - nr, " L: %d\n", + limit); + return nr; +} + +static int lprocfs_rd_grant_speed(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ldlm_pool *pl = data; + int grant_speed; + + spin_lock(&pl->pl_lock); + /* serialize with ldlm_pool_recalc */ + grant_speed = atomic_read(&pl->pl_grant_rate) - + atomic_read(&pl->pl_cancel_rate); + spin_unlock(&pl->pl_lock); + return lprocfs_rd_uint(page, start, off, count, eof, &grant_speed); +} + +LDLM_POOL_PROC_READER(grant_plan, int); +LDLM_POOL_PROC_READER(recalc_period, int); +LDLM_POOL_PROC_WRITER(recalc_period, int); + +static int ldlm_pool_proc_init(struct ldlm_pool *pl) +{ + struct ldlm_namespace *ns = ldlm_pl2ns(pl); + struct proc_dir_entry *parent_ns_proc; + struct lprocfs_vars pool_vars[2]; + char *var_name = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC(var_name, MAX_STRING_SIZE + 1); + if (!var_name) + RETURN(-ENOMEM); + + parent_ns_proc = lprocfs_srch(ldlm_ns_proc_dir, + ldlm_ns_name(ns)); + if (parent_ns_proc == NULL) { + CERROR("%s: proc entry is not initialized\n", + ldlm_ns_name(ns)); + GOTO(out_free_name, rc = -EINVAL); + } + pl->pl_proc_dir = lprocfs_register("pool", parent_ns_proc, + NULL, NULL); + if (IS_ERR(pl->pl_proc_dir)) { + CERROR("LProcFS failed in ldlm-pool-init\n"); + rc = PTR_ERR(pl->pl_proc_dir); + GOTO(out_free_name, rc); + } + + var_name[MAX_STRING_SIZE] = '\0'; + memset(pool_vars, 0, sizeof(pool_vars)); + pool_vars[0].name = var_name; + + snprintf(var_name, MAX_STRING_SIZE, "server_lock_volume"); + pool_vars[0].data = &pl->pl_server_lock_volume; + pool_vars[0].read_fptr = lprocfs_rd_u64; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "limit"); + pool_vars[0].data = &pl->pl_limit; + pool_vars[0].read_fptr = lprocfs_rd_atomic; + pool_vars[0].write_fptr = lprocfs_wr_atomic; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "granted"); + pool_vars[0].data = &pl->pl_granted; + pool_vars[0].read_fptr = lprocfs_rd_atomic; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "grant_speed"); + pool_vars[0].data = pl; + pool_vars[0].read_fptr = lprocfs_rd_grant_speed; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "cancel_rate"); + pool_vars[0].data = &pl->pl_cancel_rate; + pool_vars[0].read_fptr = lprocfs_rd_atomic; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "grant_rate"); + pool_vars[0].data = &pl->pl_grant_rate; + pool_vars[0].read_fptr = lprocfs_rd_atomic; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "grant_plan"); + pool_vars[0].data = pl; + pool_vars[0].read_fptr = lprocfs_rd_grant_plan; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "recalc_period"); + pool_vars[0].data = pl; + pool_vars[0].read_fptr = lprocfs_rd_recalc_period; + pool_vars[0].write_fptr = lprocfs_wr_recalc_period; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "lock_volume_factor"); + pool_vars[0].data = &pl->pl_lock_volume_factor; + pool_vars[0].read_fptr = lprocfs_rd_atomic; + pool_vars[0].write_fptr = lprocfs_wr_atomic; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + snprintf(var_name, MAX_STRING_SIZE, "state"); + pool_vars[0].data = pl; + pool_vars[0].read_fptr = lprocfs_rd_pool_state; + lprocfs_add_vars(pl->pl_proc_dir, pool_vars, 0); + + pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - + LDLM_POOL_FIRST_STAT, 0); + if (!pl->pl_stats) + GOTO(out_free_name, rc = -ENOMEM); + + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "granted", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "cancel_rate", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "grant_plan", "locks/s"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "slv", "slv"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_request", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "shrink_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_freed", "locks"); + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, + "recalc_timing", "sec"); + rc = lprocfs_register_stats(pl->pl_proc_dir, "stats", pl->pl_stats); + + EXIT; +out_free_name: + OBD_FREE(var_name, MAX_STRING_SIZE + 1); + return rc; +} + +static void ldlm_pool_proc_fini(struct ldlm_pool *pl) +{ + if (pl->pl_stats != NULL) { + lprocfs_free_stats(&pl->pl_stats); + pl->pl_stats = NULL; + } + if (pl->pl_proc_dir != NULL) { + lprocfs_remove(&pl->pl_proc_dir); + pl->pl_proc_dir = NULL; + } +} + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client) +{ + int rc; + ENTRY; + + spin_lock_init(&pl->pl_lock); + atomic_set(&pl->pl_granted, 0); + pl->pl_recalc_time = cfs_time_current_sec(); + atomic_set(&pl->pl_lock_volume_factor, 1); + + atomic_set(&pl->pl_grant_rate, 0); + atomic_set(&pl->pl_cancel_rate, 0); + pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); + + snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", + ldlm_ns_name(ns), idx); + + if (client == LDLM_NAMESPACE_SERVER) { + pl->pl_ops = &ldlm_srv_pool_ops; + ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); + pl->pl_recalc_period = LDLM_POOL_SRV_DEF_RECALC_PERIOD; + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); + } else { + ldlm_pool_set_limit(pl, 1); + pl->pl_server_lock_volume = 0; + pl->pl_ops = &ldlm_cli_pool_ops; + pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; + } + pl->pl_client_lock_volume = 0; + rc = ldlm_pool_proc_init(pl); + if (rc) + RETURN(rc); + + CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_pool_init); + +void ldlm_pool_fini(struct ldlm_pool *pl) +{ + ENTRY; + ldlm_pool_proc_fini(pl); + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); + EXIT; +} +EXPORT_SYMBOL(ldlm_pool_fini); + +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. + */ +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * FLOCK locks are special in a sense that they are almost never + * cancelled, instead special kind of lock is used to drop them. + * also there is no LRU for flock locks, so no point in tracking + * them anyway. + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + atomic_inc(&pl->pl_granted); + atomic_inc(&pl->pl_grant_rate); + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); + /* + * Do not do pool recalc for client side as all locks which + * potentially may be canceled has already been packed into + * enqueue/cancel rpc. Also we do not want to run out of stack + * with too long call paths. + */ + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_add); + +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) +{ + /* + * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + */ + if (lock->l_resource->lr_type == LDLM_FLOCK) + return; + + LASSERT(atomic_read(&pl->pl_granted) > 0); + atomic_dec(&pl->pl_granted); + atomic_inc(&pl->pl_cancel_rate); + + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); + + if (ns_is_server(ldlm_pl2ns(pl))) + ldlm_pool_recalc(pl); +} +EXPORT_SYMBOL(ldlm_pool_del); + +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_slv); + +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) +{ + spin_lock(&pl->pl_lock); + pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_slv); + +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_clv); + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_clv); + +/** + * Returns current \a pl limit. + */ +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_limit); +} +EXPORT_SYMBOL(ldlm_pool_get_limit); + +/** + * Sets passed \a limit to \a pl. + */ +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) +{ + atomic_set(&pl->pl_limit, limit); +} +EXPORT_SYMBOL(ldlm_pool_set_limit); + +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} +EXPORT_SYMBOL(ldlm_pool_get_lvf); + +static int ldlm_pool_granted(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_granted); +} + +static struct ptlrpc_thread *ldlm_pools_thread; +static struct shrinker *ldlm_pools_srv_shrinker; +static struct shrinker *ldlm_pools_cli_shrinker; +static struct completion ldlm_pools_comp; + +/* + * Cancel \a nr locks from all namespaces (if possible). Returns number of + * cached locks after shrink is finished. All namespaces are asked to + * cancel approximately equal amount of locks to keep balancing. + */ +static int ldlm_pools_shrink(ldlm_side_t client, int nr, + unsigned int gfp_mask) +{ + int total = 0, cached = 0, nr_ns; + struct ldlm_namespace *ns; + void *cookie; + + if (client == LDLM_NAMESPACE_CLIENT && nr != 0 && + !(gfp_mask & __GFP_FS)) + return -1; + + CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n", + nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); + + cookie = cl_env_reenter(); + + /* + * Find out how many resources we may release. + */ + for (nr_ns = atomic_read(ldlm_namespace_nr(client)); + nr_ns > 0; nr_ns--) + { + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + cl_env_reexit(cookie); + return 0; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); + ldlm_namespace_put(ns); + } + + if (nr == 0 || total == 0) { + cl_env_reexit(cookie); + return total; + } + + /* + * Shrink at least ldlm_namespace_nr(client) namespaces. + */ + for (nr_ns = atomic_read(ldlm_namespace_nr(client)); + nr_ns > 0; nr_ns--) + { + int cancel, nr_locks; + + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + /* + * If list is empty, we can't return any @cached > 0, + * that probably would cause needless shrinker + * call. + */ + cached = 0; + break; + } + ns = ldlm_namespace_first_locked(client); + ldlm_namespace_get(ns); + ldlm_namespace_move_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + nr_locks = ldlm_pool_granted(&ns->ns_pool); + cancel = 1 + nr_locks * nr / total; + ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); + cached += ldlm_pool_granted(&ns->ns_pool); + ldlm_namespace_put(ns); + } + cl_env_reexit(cookie); + /* we only decrease the SLV in server pools shrinker, return -1 to + * kernel to avoid needless loop. LU-1128 */ + return (client == LDLM_NAMESPACE_SERVER) ? -1 : cached; +} + +static int ldlm_pools_srv_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_SERVER, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +static int ldlm_pools_cli_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + return ldlm_pools_shrink(LDLM_NAMESPACE_CLIENT, + shrink_param(sc, nr_to_scan), + shrink_param(sc, gfp_mask)); +} + +void ldlm_pools_recalc(ldlm_side_t client) +{ + __u32 nr_l = 0, nr_p = 0, l; + struct ldlm_namespace *ns; + int nr, equal = 0; + + /* + * No need to setup pool limit for client pools. + */ + if (client == LDLM_NAMESPACE_SERVER) { + /* + * Check all modest namespaces first. + */ + mutex_lock(ldlm_namespace_lock(client)); + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) + continue; + + l = ldlm_pool_granted(&ns->ns_pool); + if (l == 0) + l = 1; + + /* + * Set the modest pools limit equal to their avg granted + * locks + ~6%. + */ + l += dru(l, LDLM_POOLS_MODEST_MARGIN_SHIFT, 0); + ldlm_pool_setup(&ns->ns_pool, l); + nr_l += l; + nr_p++; + } + + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ + if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { + CWARN("\"Modest\" pools eat out 2/3 of server locks " + "limit (%d of %lu). This means that you have too " + "many clients for this amount of server RAM. " + "Upgrade server!\n", nr_l, LDLM_POOL_HOST_L); + equal = 1; + } + + /* + * The rest is given to greedy namespaces. + */ + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) + { + if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) + continue; + + if (equal) { + /* + * In the case 2/3 locks are eaten out by + * modest pools, we re-setup equal limit + * for _all_ pools. + */ + l = LDLM_POOL_HOST_L / + atomic_read( + ldlm_namespace_nr(client)); + } else { + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ + l = (LDLM_POOL_HOST_L - nr_l) / + (atomic_read( + ldlm_namespace_nr(client)) - + nr_p); + } + ldlm_pool_setup(&ns->ns_pool, l); + } + mutex_unlock(ldlm_namespace_lock(client)); + } + + /* + * Recalc at least ldlm_namespace_nr(client) namespaces. + */ + for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) { + int skip; + /* + * Lock the list, get first @ns in the list, getref, move it + * to the tail, unlock and call pool recalc. This way we avoid + * calling recalc under @ns lock what is really good as we get + * rid of potential deadlock on client nodes when canceling + * locks synchronously. + */ + mutex_lock(ldlm_namespace_lock(client)); + if (list_empty(ldlm_namespace_list(client))) { + mutex_unlock(ldlm_namespace_lock(client)); + break; + } + ns = ldlm_namespace_first_locked(client); + + spin_lock(&ns->ns_lock); + /* + * skip ns which is being freed, and we don't want to increase + * its refcount again, not even temporarily. bz21519 & LU-499. + */ + if (ns->ns_stopping) { + skip = 1; + } else { + skip = 0; + ldlm_namespace_get(ns); + } + spin_unlock(&ns->ns_lock); + + ldlm_namespace_move_locked(ns, client); + mutex_unlock(ldlm_namespace_lock(client)); + + /* + * After setup is done - recalc the pool. + */ + if (!skip) { + ldlm_pool_recalc(&ns->ns_pool); + ldlm_namespace_put(ns); + } + } +} +EXPORT_SYMBOL(ldlm_pools_recalc); + +static int ldlm_pools_thread_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + ENTRY; + + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread starting, process %d\n", + "ldlm_poold", current_pid()); + + while (1) { + struct l_wait_info lwi; + + /* + * Recal all pools on this tick. + */ + ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); + ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); + + /* + * Wait until the next check time, or until we're + * stopped. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + else + thread_test_and_clear_flags(thread, SVC_EVENT); + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_DLMTRACE, "%s: pool thread exiting, process %d\n", + "ldlm_poold", current_pid()); + + complete_and_exit(&ldlm_pools_comp, 0); +} + +static int ldlm_pools_thread_start(void) +{ + struct l_wait_info lwi = { 0 }; + task_t *task; + ENTRY; + + if (ldlm_pools_thread != NULL) + RETURN(-EALREADY); + + OBD_ALLOC_PTR(ldlm_pools_thread); + if (ldlm_pools_thread == NULL) + RETURN(-ENOMEM); + + init_completion(&ldlm_pools_comp); + init_waitqueue_head(&ldlm_pools_thread->t_ctl_waitq); + + task = kthread_run(ldlm_pools_thread_main, ldlm_pools_thread, + "ldlm_poold"); + if (IS_ERR(task)) { + CERROR("Can't start pool thread, error %ld\n", PTR_ERR(task)); + OBD_FREE(ldlm_pools_thread, sizeof(*ldlm_pools_thread)); + ldlm_pools_thread = NULL; + RETURN(PTR_ERR(task)); + } + l_wait_event(ldlm_pools_thread->t_ctl_waitq, + thread_is_running(ldlm_pools_thread), &lwi); + RETURN(0); +} + +static void ldlm_pools_thread_stop(void) +{ + ENTRY; + + if (ldlm_pools_thread == NULL) { + EXIT; + return; + } + + thread_set_flags(ldlm_pools_thread, SVC_STOPPING); + wake_up(&ldlm_pools_thread->t_ctl_waitq); + + /* + * Make sure that pools thread is finished before freeing @thread. + * This fixes possible race and oops due to accessing freed memory + * in pools thread. + */ + wait_for_completion(&ldlm_pools_comp); + OBD_FREE_PTR(ldlm_pools_thread); + ldlm_pools_thread = NULL; + EXIT; +} + +int ldlm_pools_init(void) +{ + int rc; + ENTRY; + + rc = ldlm_pools_thread_start(); + if (rc == 0) { + ldlm_pools_srv_shrinker = + set_shrinker(DEFAULT_SEEKS, + ldlm_pools_srv_shrink); + ldlm_pools_cli_shrinker = + set_shrinker(DEFAULT_SEEKS, + ldlm_pools_cli_shrink); + } + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_pools_init); + +void ldlm_pools_fini(void) +{ + if (ldlm_pools_srv_shrinker != NULL) { + remove_shrinker(ldlm_pools_srv_shrinker); + ldlm_pools_srv_shrinker = NULL; + } + if (ldlm_pools_cli_shrinker != NULL) { + remove_shrinker(ldlm_pools_cli_shrinker); + ldlm_pools_cli_shrinker = NULL; + } + ldlm_pools_thread_stop(); +} +EXPORT_SYMBOL(ldlm_pools_fini); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c new file mode 100644 index 000000000000..1a690edaba03 --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c @@ -0,0 +1,2333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** + * This file contains Asynchronous System Trap (AST) handlers and related + * LDLM request-processing routines. + * + * An AST is a callback issued on a lock when its state is changed. There are + * several different types of ASTs (callbacks) registered for each lock: + * + * - completion AST: when a lock is enqueued by some process, but cannot be + * granted immediately due to other conflicting locks on the same resource, + * the completion AST is sent to notify the caller when the lock is + * eventually granted + * + * - blocking AST: when a lock is granted to some process, if another process + * enqueues a conflicting (blocking) lock on a resource, a blocking AST is + * sent to notify the holder(s) of the lock(s) of the conflicting lock + * request. The lock holder(s) must release their lock(s) on that resource in + * a timely manner or be evicted by the server. + * + * - glimpse AST: this is used when a process wants information about a lock + * (i.e. the lock value block (LVB)) but does not necessarily require holding + * the lock. If the resource is locked, the lock holder(s) are sent glimpse + * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL + * their lock(s) if they are idle. If the resource is not locked, the server + * may grant the lock. + */ + +#define DEBUG_SUBSYSTEM S_LDLM + +#include +#include +#include + +#include "ldlm_internal.h" + +int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; +CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644, + "lock enqueue timeout minimum"); + +/* in client side, whether the cached locks will be canceled before replay */ +unsigned int ldlm_cancel_unused_locks_before_replay = 1; + +static void interrupted_completion_wait(void *data) +{ +} + +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + __u32 lwd_conn_cnt; +}; + +struct ldlm_async_args { + struct lustre_handle lock_handle; +}; + +int ldlm_expired_completion_wait(void *data) +{ + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_import *imp; + struct obd_device *obd; + + ENTRY; + if (lock->l_conn_export == NULL) { + static cfs_time_t next_dump = 0, last_dump = 0; + + if (ptlrpc_check_suspend()) + RETURN(0); + + LCONSOLE_WARN("lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago)\n", + lock->l_last_activity, + cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity)); + LDLM_DEBUG(lock, "lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago); not entering recovery in " + "server code, just going back to sleep", + lock->l_last_activity, + cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity)); + if (cfs_time_after(cfs_time_current(), next_dump)) { + last_dump = next_dump; + next_dump = cfs_time_shift(300); + ldlm_namespace_dump(D_DLMTRACE, + ldlm_lock_to_ns(lock)); + if (last_dump == 0) + libcfs_debug_dumplog(); + } + RETURN(0); + } + + obd = lock->l_conn_export->exp_obd; + imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); + LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " + CFS_DURATION_T"s ago), entering recovery for %s@%s", + lock->l_last_activity, + cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity), + obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); + + RETURN(0); +} +EXPORT_SYMBOL(ldlm_expired_completion_wait); + +/* We use the same basis for both server side and client side functions + from a single node. */ +int ldlm_get_enq_timeout(struct ldlm_lock *lock) +{ + int timeout = at_get(ldlm_lock_to_ns_at(lock)); + if (AT_OFF) + return obd_timeout / 2; + /* Since these are non-updating timeouts, we should be conservative. + It would be nice to have some kind of "early reply" mechanism for + lock callbacks too... */ + timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */ + return max(timeout, ldlm_enqueue_min); +} +EXPORT_SYMBOL(ldlm_get_enq_timeout); + +/** + * Helper function for ldlm_completion_ast(), updating timings when lock is + * actually granted. + */ +static int ldlm_completion_tail(struct ldlm_lock *lock) +{ + long delay; + int result; + + if (lock->l_destroyed || lock->l_flags & LDLM_FL_FAILED) { + LDLM_DEBUG(lock, "client-side enqueue: destroyed"); + result = -EIO; + } else { + delay = cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity); + LDLM_DEBUG(lock, "client-side enqueue: granted after " + CFS_DURATION_T"s", delay); + + /* Update our time estimate */ + at_measured(ldlm_lock_to_ns_at(lock), + delay); + result = 0; + } + return result; +} + +/** + * Implementation of ->l_completion_ast() for a client, that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd, of + * other threads that cannot block for long. + */ +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data) +{ + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + RETURN(0); + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + RETURN(ldlm_completion_tail(lock)); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "going forward"); + ldlm_reprocess_all(lock->l_resource); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_completion_ast_async); + +/** + * Generic LDLM "completion" AST. This is called in several cases: + * + * - when a reply to an ENQUEUE RPC is received from the server + * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at + * this point (determined by flags); + * + * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has + * been granted; + * + * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock + * gets correct lvb; + * + * - to force all locks when resource is destroyed (cleanup_resource()); + * + * - during lock conversion (not used currently). + * + * If lock is not granted in the first case, this function waits until second + * or penultimate cases happen in some other thread. + * + */ +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) +{ + /* XXX ALLOCATE - 160 bytes */ + struct lock_wait_data lwd; + struct obd_device *obd; + struct obd_import *imp = NULL; + struct l_wait_info lwi; + __u32 timeout; + int rc = 0; + ENTRY; + + if (flags == LDLM_FL_WAIT_NOREPROC) { + LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); + goto noreproc; + } + + if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV))) { + wake_up(&lock->l_waitq); + RETURN(0); + } + + LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " + "sleeping"); + +noreproc: + + obd = class_exp2obd(lock->l_conn_export); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) { + imp = obd->u.cli.cl_import; + } + + /* Wait a long time for enqueue - server may have to callback a + lock from another client. Server will evict the other client if it + doesn't respond reasonably, and then give us the lock. */ + timeout = ldlm_get_enq_timeout(lock) * 2; + + lwd.lwd_lock = lock; + + if (lock->l_flags & LDLM_FL_NO_TIMEOUT) { + LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); + lwi = LWI_INTR(interrupted_completion_wait, &lwd); + } else { + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); + } + + if (imp != NULL) { + spin_lock(&imp->imp_lock); + lwd.lwd_conn_cnt = imp->imp_conn_cnt; + spin_unlock(&imp->imp_lock); + } + + if (ns_is_client(ldlm_lock_to_ns(lock)) && + OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, + OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { + lock->l_flags |= LDLM_FL_FAIL_LOC; + rc = -EINTR; + } else { + /* Go to sleep until the lock is granted or cancelled. */ + rc = l_wait_event(lock->l_waitq, + is_granted_or_cancelled(lock), &lwi); + } + + if (rc) { + LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", + rc); + RETURN(rc); + } + + RETURN(ldlm_completion_tail(lock)); +} +EXPORT_SYMBOL(ldlm_completion_ast); + +/** + * A helper to build a blocking AST function + * + * Perform a common operation for blocking ASTs: + * defferred lock cancellation. + * + * \param lock the lock blocking or canceling AST was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast + */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) +{ + int do_ast; + ENTRY; + + lock->l_flags |= LDLM_FL_CBPENDING; + do_ast = (!lock->l_readers && !lock->l_writers); + unlock_res_and_lock(lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be " + "cancelled later"); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); + +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the lr_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} +EXPORT_SYMBOL(ldlm_blocking_ast); + +/** + * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See + * comment in filter_intent_policy() on why you may need this. + */ +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp) +{ + /* + * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for + * that is rather subtle: with OST-side locking, it may so happen that + * _all_ extent locks are held by the OST. If client wants to obtain + * current file size it calls ll{,u}_glimpse_size(), and (as locks are + * on the server), dummy glimpse callback fires and does + * nothing. Client still receives correct file size due to the + * following fragment in filter_intent_policy(): + * + * rc = l->l_glimpse_ast(l, NULL); // this will update the LVB + * if (rc != 0 && res->lr_namespace->ns_lvbo && + * res->lr_namespace->ns_lvbo->lvbo_update) { + * res->lr_namespace->ns_lvbo->lvbo_update(res, NULL, 0, 1); + * } + * + * that is, after glimpse_ast() fails, filter_lvbo_update() runs, and + * returns correct file size to the client. + */ + return -ELDLM_NO_LOCK_DATA; +} +EXPORT_SYMBOL(ldlm_glimpse_ast); + +/** + * Enqueue a local lock (typically on a server). + */ +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh) +{ + struct ldlm_lock *lock; + int err; + const struct ldlm_callback_suite cbs = { .lcs_completion = completion, + .lcs_blocking = blocking, + .lcs_glimpse = glimpse, + }; + ENTRY; + + LASSERT(!(*flags & LDLM_FL_REPLAY)); + if (unlikely(ns_is_client(ns))) { + CERROR("Trying to enqueue local lock in a shadow namespace\n"); + LBUG(); + } + + lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len, + lvb_type); + if (unlikely(!lock)) + GOTO(out_nolock, err = -ENOMEM); + + ldlm_lock2handle(lock, lockh); + + /* NB: we don't have any lock now (lock_res_and_lock) + * because it's a new lock */ + ldlm_lock_addref_internal_nolock(lock, mode); + lock->l_flags |= LDLM_FL_LOCAL; + if (*flags & LDLM_FL_ATOMIC_CB) + lock->l_flags |= LDLM_FL_ATOMIC_CB; + + if (policy != NULL) + lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; + if (type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + + err = ldlm_lock_enqueue(ns, &lock, policy, flags); + if (unlikely(err != ELDLM_OK)) + GOTO(out, err); + + if (policy != NULL) + *policy = lock->l_policy_data; + + if (lock->l_completion_ast) + lock->l_completion_ast(lock, *flags, NULL); + + LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); + EXIT; + out: + LDLM_LOCK_RELEASE(lock); + out_nolock: + return err; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_local); + +static void failed_lock_cleanup(struct ldlm_namespace *ns, + struct ldlm_lock *lock, int mode) +{ + int need_cancel = 0; + + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + lock_res_and_lock(lock); + /* Check that lock is not granted or failed, we might race. */ + if ((lock->l_req_mode != lock->l_granted_mode) && + !(lock->l_flags & LDLM_FL_FAILED)) { + /* Make sure that this lock will not be found by raced + * bl_ast and -EINVAL reply is sent to server anyways. + * bug 17645 */ + lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | + LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; + need_cancel = 1; + } + unlock_res_and_lock(lock); + + if (need_cancel) + LDLM_DEBUG(lock, + "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | " + "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); + else + LDLM_DEBUG(lock, "lock was granted or failed in race"); + + ldlm_lock_decref_internal(lock, mode); + + /* XXX - HACK because we shouldn't call ldlm_lock_destroy() + * from llite/file.c/ll_file_flock(). */ + /* This code makes for the fact that we do not have blocking handler on + * a client for flock locks. As such this is the place where we must + * completely kill failed locks. (interrupted and those that + * were waiting to be granted when server evicted us. */ + if (lock->l_resource->lr_type == LDLM_FLOCK) { + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + ldlm_lock_destroy_nolock(lock); + unlock_res_and_lock(lock); + } +} + +/** + * Finishing portion of client lock enqueue code. + * + * Called after receiving reply from server. + */ +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh,int rc) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + int is_replay = *flags & LDLM_FL_REPLAY; + struct ldlm_lock *lock; + struct ldlm_reply *reply; + int cleanup_phase = 1; + int size = 0; + ENTRY; + + lock = ldlm_handle2lock(lockh); + /* ldlm_cli_enqueue is holding a reference on this lock. */ + if (!lock) { + LASSERT(type == LDLM_FLOCK); + RETURN(-ENOLCK); + } + + LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), + "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); + + if (rc != ELDLM_OK) { + LASSERT(!is_replay); + LDLM_DEBUG(lock, "client-side enqueue END (%s)", + rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + + if (rc != ELDLM_LOCK_ABORTED) + GOTO(cleanup, rc); + } + + /* Before we return, swab the reply */ + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(cleanup, rc = -EPROTO); + + if (lvb_len != 0) { + LASSERT(lvb != NULL); + + size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, + RCL_SERVER); + if (size < 0) { + LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); + GOTO(cleanup, rc = size); + } else if (unlikely(size > lvb_len)) { + LDLM_ERROR(lock, "Replied LVB is larger than " + "expectation, expected = %d, replied = %d", + lvb_len, size); + GOTO(cleanup, rc = -EINVAL); + } + } + + if (rc == ELDLM_LOCK_ABORTED) { + if (lvb_len != 0) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lvb, size); + GOTO(cleanup, rc = (rc != 0 ? rc : ELDLM_LOCK_ABORTED)); + } + + /* lock enqueued on the server */ + cleanup_phase = 0; + + lock_res_and_lock(lock); + /* Key change rehash lock in per-export hash with new key */ + if (exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + *flags = ldlm_flags_from_wire(reply->lock_flags); + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_INHERIT_FLAGS); + /* move NO_TIMEOUT flag to the lock to force ldlm_lock_match() + * to wait with no timeout as well */ + lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & + LDLM_FL_NO_TIMEOUT); + unlock_res_and_lock(lock); + + CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: 0x%llx\n", + lock, reply->lock_handle.cookie, *flags); + + /* If enqueue returned a blocked lock but the completion handler has + * already run, then it fixed up the resource and we don't need to do it + * again. */ + if ((*flags) & LDLM_FL_LOCK_CHANGED) { + int newmode = reply->lock_desc.l_req_mode; + LASSERT(!is_replay); + if (newmode && newmode != lock->l_req_mode) { + LDLM_DEBUG(lock, "server returned different mode %s", + ldlm_lockname[newmode]); + lock->l_req_mode = newmode; + } + + if (memcmp(reply->lock_desc.l_resource.lr_name.name, + lock->l_resource->lr_name.name, + sizeof(struct ldlm_res_id))) { + CDEBUG(D_INFO, "remote intent success, locking " + "(%ld,%ld,%ld) instead of " + "(%ld,%ld,%ld)\n", + (long)reply->lock_desc.l_resource.lr_name.name[0], + (long)reply->lock_desc.l_resource.lr_name.name[1], + (long)reply->lock_desc.l_resource.lr_name.name[2], + (long)lock->l_resource->lr_name.name[0], + (long)lock->l_resource->lr_name.name[1], + (long)lock->l_resource->lr_name.name[2]); + + rc = ldlm_lock_change_resource(ns, lock, + &reply->lock_desc.l_resource.lr_name); + if (rc || lock->l_resource == NULL) + GOTO(cleanup, rc = -ENOMEM); + LDLM_DEBUG(lock, "client-side enqueue, new resource"); + } + if (with_policy) + if (!(type == LDLM_IBITS && + !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) + /* We assume lock type cannot change on server*/ + ldlm_convert_policy_to_local(exp, + lock->l_resource->lr_type, + &reply->lock_desc.l_policy_data, + &lock->l_policy_data); + if (type != LDLM_PLAIN) + LDLM_DEBUG(lock,"client-side enqueue, new policy data"); + } + + if ((*flags) & LDLM_FL_AST_SENT || + /* Cancel extent locks as soon as possible on a liblustre client, + * because it cannot handle asynchronous ASTs robustly (see + * bug 7311). */ + (LIBLUSTRE_CLIENT && type == LDLM_EXTENT)) { + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); + } + + /* If the lock has already been granted by a completion AST, don't + * clobber the LVB with an older one. */ + if (lvb_len != 0) { + /* We must lock or a racing completion might update lvb without + * letting us know and we'll clobber the correct value. + * Cannot unlock after the check either, a that still leaves + * a tiny window for completion to get in */ + lock_res_and_lock(lock); + if (lock->l_req_mode != lock->l_granted_mode) + rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, + lock->l_lvb_data, size); + unlock_res_and_lock(lock); + if (rc < 0) { + cleanup_phase = 1; + GOTO(cleanup, rc); + } + } + + if (!is_replay) { + rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); + if (lock->l_completion_ast != NULL) { + int err = lock->l_completion_ast(lock, *flags, NULL); + if (!rc) + rc = err; + if (rc) + cleanup_phase = 1; + } + } + + if (lvb_len && lvb != NULL) { + /* Copy the LVB here, and not earlier, because the completion + * AST (if any) can override what we got in the reply */ + memcpy(lvb, lock->l_lvb_data, lvb_len); + } + + LDLM_DEBUG(lock, "client-side enqueue END"); + EXIT; +cleanup: + if (cleanup_phase == 1 && rc) + failed_lock_cleanup(ns, lock, mode); + /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ + LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_enqueue_fini); + +/** + * Estimate number of lock handles that would fit into request of given + * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into + * a single page on the send/receive side. XXX: 512 should be changed to + * more adequate value. + */ +static inline int ldlm_req_handles_avail(int req_size, int off) +{ + int avail; + + avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size; + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; + avail += LDLM_LOCKREQ_HANDLES - off; + + return avail; +} + +static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, + enum req_location loc, + int off) +{ + int size = req_capsule_msg_size(pill, loc); + return ldlm_req_handles_avail(size, off); +} + +static inline int ldlm_format_handles_avail(struct obd_import *imp, + const struct req_format *fmt, + enum req_location loc, int off) +{ + int size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); + return ldlm_req_handles_avail(size, off); +} + +/** + * Cancel LRU locks and pack them into the enqueue request. Pack there the given + * \a count locks in \a cancels. + * + * This is to be called by functions preparing their own requests that + * might contain lists of locks to cancel in addition to actual operation + * that needs to be performed. + */ +int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *dlm = NULL; + int flags, avail, to_free, pack = 0; + LIST_HEAD(head); + int rc; + ENTRY; + + if (cancels == NULL) + cancels = &head; + if (ns_connect_cancelset(ns)) { + /* Estimate the amount of available space in the request. */ + req_capsule_filled_sizes(pill, RCL_CLIENT); + avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); + + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + to_free = !ns_connect_lru_resize(ns) && + opc == LDLM_ENQUEUE ? 1 : 0; + + /* Cancel LRU locks here _only_ if the server supports + * EARLY_CANCEL. Otherwise we have to send extra CANCEL + * RPC, which will make us slower. */ + if (avail > count) + count += ldlm_cancel_lru_local(ns, cancels, to_free, + avail - count, 0, flags); + if (avail > count) + pack = count; + else + pack = avail; + req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(pack, opc)); + } + + rc = ptlrpc_request_pack(req, version, opc); + if (rc) { + ldlm_lock_list_put(cancels, l_bl_ast, count); + RETURN(rc); + } + + if (ns_connect_cancelset(ns)) { + if (canceloff) { + dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); + LASSERT(dlm); + /* Skip first lock handler in ldlm_request_pack(), + * this method will incrment @lock_count according + * to the lock handle amount actually written to + * the buffer. */ + dlm->lock_count = canceloff; + } + /* Pack into the request @pack lock handles. */ + ldlm_cli_cancel_list(cancels, pack, req, 0); + /* Prepare and send separate cancel RPC for others. */ + ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); + } else { + ldlm_lock_list_put(cancels, l_bl_ast, count); + } + RETURN(0); +} +EXPORT_SYMBOL(ldlm_prep_elc_req); + +int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, + LDLM_ENQUEUE_CANCEL_OFF, cancels, count); +} +EXPORT_SYMBOL(ldlm_prep_enqueue_req); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} +EXPORT_SYMBOL(ldlm_enqueue_pack); + +/** + * Client-side lock enqueue. + * + * If a request has some specific initialisation it is passed in \a reqp, + * otherwise it is created in ldlm_cli_enqueue. + * + * Supports sync and async requests, pass \a async flag accordingly. If a + * request was created in ldlm_cli_enqueue and it is the async request, + * pass it to the caller in \a reqp. + */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async) +{ + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + struct ldlm_request *body; + int is_replay = *flags & LDLM_FL_REPLAY; + int req_passed_in = 1; + int rc, err; + struct ptlrpc_request *req; + ENTRY; + + LASSERT(exp != NULL); + + ns = exp->exp_obd->obd_namespace; + + /* If we're replaying this lock, just check some invariants. + * If we're creating a new lock, get everything all setup nice. */ + if (is_replay) { + lock = ldlm_handle2lock_long(lockh, 0); + LASSERT(lock != NULL); + LDLM_DEBUG(lock, "client-side enqueue START"); + LASSERT(exp == lock->l_conn_export); + } else { + const struct ldlm_callback_suite cbs = { + .lcs_completion = einfo->ei_cb_cp, + .lcs_blocking = einfo->ei_cb_bl, + .lcs_glimpse = einfo->ei_cb_gl, + .lcs_weigh = einfo->ei_cb_wg + }; + lock = ldlm_lock_create(ns, res_id, einfo->ei_type, + einfo->ei_mode, &cbs, einfo->ei_cbdata, + lvb_len, lvb_type); + if (lock == NULL) + RETURN(-ENOMEM); + /* for the local lock, add the reference */ + ldlm_lock_addref_internal(lock, einfo->ei_mode); + ldlm_lock2handle(lock, lockh); + if (policy != NULL) { + /* INODEBITS_INTEROP: If the server does not support + * inodebits, we will request a plain lock in the + * descriptor (ldlm_lock2desc() below) but use an + * inodebits lock internally with both bits set. + */ + if (einfo->ei_type == LDLM_IBITS && + !(exp_connect_flags(exp) & + OBD_CONNECT_IBITS)) + lock->l_policy_data.l_inodebits.bits = + MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE; + else + lock->l_policy_data = *policy; + } + + if (einfo->ei_type == LDLM_EXTENT) + lock->l_req_extent = policy->l_extent; + LDLM_DEBUG(lock, "client-side enqueue START, flags %llx\n", + *flags); + } + + lock->l_conn_export = exp; + lock->l_export = NULL; + lock->l_blocking_ast = einfo->ei_cb_bl; + lock->l_flags |= (*flags & LDLM_FL_NO_LRU); + + /* lock not sent to server yet */ + + if (reqp == NULL || *reqp == NULL) { + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) { + failed_lock_cleanup(ns, lock, einfo->ei_mode); + LDLM_LOCK_RELEASE(lock); + RETURN(-ENOMEM); + } + req_passed_in = 0; + if (reqp) + *reqp = req; + } else { + int len; + + req = *reqp; + len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, + RCL_CLIENT); + LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", + DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); + } + + /* Dump lock data into the request buffer */ + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(*flags); + body->lock_handle[0] = *lockh; + + /* Continue as normal. */ + if (!req_passed_in) { + if (lvb_len > 0) + req_capsule_extend(&req->rq_pill, + &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lvb_len); + ptlrpc_request_set_replen(req); + } + + /* + * Liblustre client doesn't get extent locks, except for O_APPEND case + * where [0, OBD_OBJECT_EOF] lock is taken, or truncate, where + * [i_size, OBD_OBJECT_EOF] lock is taken. + */ + LASSERT(ergo(LIBLUSTRE_CLIENT, einfo->ei_type != LDLM_EXTENT || + policy->l_extent.end == OBD_OBJECT_EOF)); + + if (async) { + LASSERT(reqp != NULL); + RETURN(0); + } + + LDLM_DEBUG(lock, "sending request"); + + rc = ptlrpc_queue_wait(req); + + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0, + einfo->ei_mode, flags, lvb, lvb_len, + lockh, rc); + + /* If ldlm_cli_enqueue_fini did not find the lock, we need to free + * one reference that we took */ + if (err == -ENOLCK) + LDLM_LOCK_RELEASE(lock); + else + rc = err; + + if (!req_passed_in && req != NULL) { + ptlrpc_req_finished(req); + if (reqp) + *reqp = NULL; + } + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_cli_enqueue); + +static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, + __u32 *flags) +{ + struct ldlm_resource *res; + int rc; + ENTRY; + if (ns_is_client(ldlm_lock_to_ns(lock))) { + CERROR("Trying to cancel local lock\n"); + LBUG(); + } + LDLM_DEBUG(lock, "client-side local convert"); + + res = ldlm_lock_convert(lock, new_mode, flags); + if (res) { + ldlm_reprocess_all(res); + rc = 0; + } else { + rc = EDEADLOCK; + } + LDLM_DEBUG(lock, "client-side local convert handler END"); + LDLM_LOCK_PUT(lock); + RETURN(rc); +} + +/* FIXME: one of ldlm_cli_convert or the server side should reject attempted + * conversion of locks which are on the waiting or converting queue */ +/* Caller of this code is supposed to take care of lock readers/writers + accounting */ +int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags) +{ + struct ldlm_request *body; + struct ldlm_reply *reply; + struct ldlm_lock *lock; + struct ldlm_resource *res; + struct ptlrpc_request *req; + int rc; + ENTRY; + + lock = ldlm_handle2lock(lockh); + if (!lock) { + LBUG(); + RETURN(-EINVAL); + } + *flags = 0; + + if (lock->l_conn_export == NULL) + RETURN(ldlm_cli_convert_local(lock, new_mode, flags)); + + LDLM_DEBUG(lock, "client-side convert"); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(lock->l_conn_export), + &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION, + LDLM_CONVERT); + if (req == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + body->lock_handle[0] = lock->l_remote_handle; + + body->lock_desc.l_req_mode = new_mode; + body->lock_flags = ldlm_flags_to_wire(*flags); + + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc != ELDLM_OK) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + if (req->rq_status) + GOTO(out, rc = req->rq_status); + + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); + if (res != NULL) { + ldlm_reprocess_all(res); + /* Go to sleep until the lock is granted. */ + /* FIXME: or cancelled. */ + if (lock->l_completion_ast) { + rc = lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); + if (rc) + GOTO(out, rc); + } + } else { + rc = EDEADLOCK; + } + EXIT; + out: + LDLM_LOCK_PUT(lock); + ptlrpc_req_finished(req); + return rc; +} +EXPORT_SYMBOL(ldlm_cli_convert); + +/** + * Cancel locks locally. + * Returns: + * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server + * \retval LDLM_FL_CANCELING otherwise; + * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. + */ +static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) +{ + __u64 rc = LDLM_FL_LOCAL_ONLY; + ENTRY; + + if (lock->l_conn_export) { + bool local_only; + + LDLM_DEBUG(lock, "client-side cancel"); + /* Set this flag to prevent others from getting new references*/ + lock_res_and_lock(lock); + lock->l_flags |= LDLM_FL_CBPENDING; + local_only = !!(lock->l_flags & + (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK)); + ldlm_cancel_callback(lock); + rc = (lock->l_flags & LDLM_FL_BL_AST) ? + LDLM_FL_BL_AST : LDLM_FL_CANCELING; + unlock_res_and_lock(lock); + + if (local_only) { + CDEBUG(D_DLMTRACE, "not sending request (at caller's " + "instruction)\n"); + rc = LDLM_FL_LOCAL_ONLY; + } + ldlm_lock_cancel(lock); + } else { + if (ns_is_client(ldlm_lock_to_ns(lock))) { + LDLM_ERROR(lock, "Trying to cancel local lock"); + LBUG(); + } + LDLM_DEBUG(lock, "server-side local cancel"); + ldlm_lock_cancel(lock); + ldlm_reprocess_all(lock->l_resource); + } + + RETURN(rc); +} + +/** + * Pack \a count locks in \a head into ldlm_request buffer of request \a req. + */ +static void ldlm_cancel_pack(struct ptlrpc_request *req, + struct list_head *head, int count) +{ + struct ldlm_request *dlm; + struct ldlm_lock *lock; + int max, packed = 0; + ENTRY; + + dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + LASSERT(dlm != NULL); + + /* Check the room in the request buffer. */ + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + sizeof(struct ldlm_request); + max /= sizeof(struct lustre_handle); + max += LDLM_LOCKREQ_HANDLES; + LASSERT(max >= dlm->lock_count + count); + + /* XXX: it would be better to pack lock handles grouped by resource. + * so that the server cancel would call filter_lvbo_update() less + * frequently. */ + list_for_each_entry(lock, head, l_bl_ast) { + if (!count--) + break; + LASSERT(lock->l_conn_export); + /* Pack the lock handle to the given request buffer. */ + LDLM_DEBUG(lock, "packing"); + dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; + packed++; + } + CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); + EXIT; +} + +/** + * Prepare and send a batched cancel RPC. It will include \a count lock + * handles of locks given in \a cancels list. */ +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, + int count, ldlm_cancel_flags_t flags) +{ + struct ptlrpc_request *req = NULL; + struct obd_import *imp; + int free, sent = 0; + int rc = 0; + ENTRY; + + LASSERT(exp != NULL); + LASSERT(count > 0); + + CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); + + if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) + RETURN(count); + + free = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, RCL_CLIENT, 0); + if (count > free) + count = free; + + while (1) { + imp = class_exp2cliimp(exp); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_DLMTRACE, + "skipping cancel on invalid import %p\n", imp); + RETURN(count); + } + + req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, + ldlm_request_bufsize(count, LDLM_CANCEL)); + + rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; + req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; + ptlrpc_at_set_req_timeout(req); + + ldlm_cancel_pack(req, cancels, count); + + ptlrpc_request_set_replen(req); + if (flags & LCF_ASYNC) { + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + sent = count; + GOTO(out, 0); + } else { + rc = ptlrpc_queue_wait(req); + } + if (rc == ESTALE) { + CDEBUG(D_DLMTRACE, "client/server (nid %s) " + "out of sync -- not fatal\n", + libcfs_nid2str(req->rq_import-> + imp_connection->c_peer.nid)); + rc = 0; + } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ + req->rq_import_generation == imp->imp_generation) { + ptlrpc_req_finished(req); + continue; + } else if (rc != ELDLM_OK) { + /* -ESHUTDOWN is common on umount */ + CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "Got rc %d from cancel RPC: " + "canceling anyway\n", rc); + break; + } + sent = count; + break; + } + + ptlrpc_req_finished(req); + EXIT; +out: + return sent ? sent : rc; +} +EXPORT_SYMBOL(ldlm_cli_cancel_req); + +static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) +{ + LASSERT(imp != NULL); + return &imp->imp_obd->obd_namespace->ns_pool; +} + +/** + * Update client's OBD pool related fields with new SLV and Limit from \a req. + */ +int ldlm_cli_update_pool(struct ptlrpc_request *req) +{ + struct obd_device *obd; + __u64 new_slv; + __u32 new_limit; + ENTRY; + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + { + /* + * Do nothing for corner cases. + */ + RETURN(0); + } + + /* In some cases RPC may contain SLV and limit zeroed out. This + * is the case when server does not support LRU resize feature. + * This is also possible in some recovery cases when server-side + * reqs have no reference to the OBD export and thus access to + * server-side namespace is not possible. */ + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + lustre_msg_get_limit(req->rq_repmsg) == 0) { + DEBUG_REQ(D_HA, req, "Zero SLV or Limit found " + "(SLV: "LPU64", Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), + lustre_msg_get_limit(req->rq_repmsg)); + RETURN(0); + } + + new_limit = lustre_msg_get_limit(req->rq_repmsg); + new_slv = lustre_msg_get_slv(req->rq_repmsg); + obd = req->rq_import->imp_obd; + + /* Set new SLV and limit in OBD fields to make them accessible + * to the pool thread. We do not access obd_namespace and pool + * directly here as there is no reliable way to make sure that + * they are still alive at cleanup time. Evil races are possible + * which may cause Oops at that time. */ + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_update_pool); + +/** + * Client side lock cancel. + * + * Lock must not have any readers or writers by this time. + */ +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags) +{ + struct obd_export *exp; + int avail, flags, count = 1; + __u64 rc = 0; + struct ldlm_namespace *ns; + struct ldlm_lock *lock; + LIST_HEAD(cancels); + ENTRY; + + /* concurrent cancels on the same handle can happen */ + lock = ldlm_handle2lock_long(lockh, LDLM_FL_CANCELING); + if (lock == NULL) { + LDLM_DEBUG_NOLOCK("lock is already being destroyed\n"); + RETURN(0); + } + + rc = ldlm_cli_cancel_local(lock); + if (rc == LDLM_FL_LOCAL_ONLY) { + LDLM_LOCK_RELEASE(lock); + RETURN(0); + } + /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL + * RPC which goes to canceld portal, so we can cancel other LRU locks + * here and send them all as one LDLM_CANCEL RPC. */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, &cancels); + + exp = lock->l_conn_export; + if (exp_connect_cancelset(exp)) { + avail = ldlm_format_handles_avail(class_exp2cliimp(exp), + &RQF_LDLM_CANCEL, + RCL_CLIENT, 0); + LASSERT(avail > 0); + + ns = ldlm_lock_to_ns(lock); + flags = ns_connect_lru_resize(ns) ? + LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; + count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, + LCF_BL_AST, flags); + } + ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel); + +/** + * Locally cancel up to \a count locks in list \a cancels. + * Return the number of cancelled locks. + */ +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags) +{ + LIST_HEAD(head); + struct ldlm_lock *lock, *next; + int left = 0, bl_ast = 0; + __u64 rc; + + left = count; + list_for_each_entry_safe(lock, next, cancels, l_bl_ast) { + if (left-- == 0) + break; + + if (flags & LCF_LOCAL) { + rc = LDLM_FL_LOCAL_ONLY; + ldlm_lock_cancel(lock); + } else { + rc = ldlm_cli_cancel_local(lock); + } + /* Until we have compound requests and can send LDLM_CANCEL + * requests batched with generic RPCs, we need to send cancels + * with the LDLM_FL_BL_AST flag in a separate RPC from + * the one being generated now. */ + if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) { + LDLM_DEBUG(lock, "Cancel lock separately"); + list_del_init(&lock->l_bl_ast); + list_add(&lock->l_bl_ast, &head); + bl_ast++; + continue; + } + if (rc == LDLM_FL_LOCAL_ONLY) { + /* CANCEL RPC should not be sent to server. */ + list_del_init(&lock->l_bl_ast); + LDLM_LOCK_RELEASE(lock); + count--; + } + } + if (bl_ast > 0) { + count -= bl_ast; + ldlm_cli_cancel_list(&head, bl_ast, NULL, 0); + } + + RETURN(count); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list_local); + +/** + * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back + * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g. + * readahead requests, ...) + */ +static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK; + ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery; + lock_res_and_lock(lock); + + /* don't check added & count since we want to process all locks + * from unused list */ + switch (lock->l_resource->lr_type) { + case LDLM_EXTENT: + case LDLM_IBITS: + if (cb && cb(lock)) + break; + default: + result = LDLM_POLICY_SKIP_LOCK; + lock->l_flags |= LDLM_FL_SKIPPED; + break; + } + + unlock_res_and_lock(lock); + RETURN(result); +} + +/** + * Callback function for LRU-resize policy. Decides whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current + * scan \a added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + cfs_time_t cur = cfs_time_current(); + struct ldlm_pool *pl = &ns->ns_pool; + __u64 slv, lvf, lv; + cfs_time_t la; + + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + if (count && added >= count) + return LDLM_POLICY_KEEP_LOCK; + + slv = ldlm_pool_get_slv(pl); + lvf = ldlm_pool_get_lvf(pl); + la = cfs_duration_sec(cfs_time_sub(cur, + lock->l_last_used)); + lv = lvf * la * unused; + + /* Inform pool about current CLV to see it via proc. */ + ldlm_pool_set_clv(pl, lv); + + /* Stop when SLV is not yet come from server or lv is smaller than + * it is. */ + return (slv == 0 || lv < slv) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for proc used policy. Makes decision whether to keep + * \a lock in LRU for current \a LRU size \a unused, added in current scan \a + * added and number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past @count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for aged policy. Makes decision whether to keep \a lock in + * LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing if young lock is found and we reach past count */ + return ((added >= count) && + cfs_time_before(cfs_time_current(), + cfs_time_add(lock->l_last_used, + ns->ns_max_age))) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +/** + * Callback function for default policy. Makes decision whether to keep \a lock + * in LRU for current LRU size \a unused, added in current scan \a added and + * number of locks to be preferably canceled \a count. + * + * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning + * + * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU + */ +static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns, + struct ldlm_lock *lock, + int unused, int added, + int count) +{ + /* Stop LRU processing when we reach past count or have checked all + * locks in LRU. */ + return (added >= count) ? + LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; +} + +typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, + struct ldlm_lock *, int, + int, int); + +static ldlm_cancel_lru_policy_t +ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) +{ + if (flags & LDLM_CANCEL_NO_WAIT) + return ldlm_cancel_no_wait_policy; + + if (ns_connect_lru_resize(ns)) { + if (flags & LDLM_CANCEL_SHRINK) + /* We kill passed number of old locks. */ + return ldlm_cancel_passed_policy; + else if (flags & LDLM_CANCEL_LRUR) + return ldlm_cancel_lrur_policy; + else if (flags & LDLM_CANCEL_PASSED) + return ldlm_cancel_passed_policy; + } else { + if (flags & LDLM_CANCEL_AGED) + return ldlm_cancel_aged_policy; + } + + return ldlm_cancel_default_policy; +} + +/** + * - Free space in LRU for \a count new locks, + * redundant unused locks are canceled locally; + * - also cancel locally unused aged locks; + * - do not cancel more than \a max locks; + * - GET the found locks and add them into the \a cancels list. + * + * A client lock can be added to the l_bl_ast list only when it is + * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing + * CANCEL. There are the following use cases: + * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and + * ldlm_cli_cancel(), which check and set this flag properly. As any + * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed + * later without any special locking. + * + * Calling policies for enabled LRU resize: + * ---------------------------------------- + * flags & LDLM_CANCEL_LRUR - use LRU resize policy (SLV from server) to + * cancel not more than \a count locks; + * + * flags & LDLM_CANCEL_PASSED - cancel \a count number of old locks (located at + * the beginning of LRU list); + * + * flags & LDLM_CANCEL_SHRINK - cancel not more than \a count locks according to + * memory pressre policy function; + * + * flags & LDLM_CANCEL_AGED - cancel \a count locks according to "aged policy". + * + * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible + * (typically before replaying locks) w/o + * sending any RPCs or waiting for any + * outstanding RPC to complete. + */ +static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, struct list_head *cancels, + int count, int max, int flags) +{ + ldlm_cancel_lru_policy_t pf; + struct ldlm_lock *lock, *next; + int added = 0, unused, remained; + ENTRY; + + spin_lock(&ns->ns_lock); + unused = ns->ns_nr_unused; + remained = unused; + + if (!ns_connect_lru_resize(ns)) + count += unused - ns->ns_max_unused; + + pf = ldlm_cancel_lru_policy(ns, flags); + LASSERT(pf != NULL); + + while (!list_empty(&ns->ns_unused_list)) { + ldlm_policy_res_t result; + + /* all unused locks */ + if (remained-- <= 0) + break; + + /* For any flags, stop scanning if @max is reached. */ + if (max && added >= max) + break; + + list_for_each_entry_safe(lock, next, &ns->ns_unused_list, + l_lru) { + /* No locks which got blocking requests. */ + LASSERT(!(lock->l_flags & LDLM_FL_BL_AST)); + + if (flags & LDLM_CANCEL_NO_WAIT && + lock->l_flags & LDLM_FL_SKIPPED) + /* already processed */ + continue; + + /* Somebody is already doing CANCEL. No need for this + * lock in LRU, do not traverse it again. */ + if (!(lock->l_flags & LDLM_FL_CANCELING)) + break; + + ldlm_lock_remove_from_lru_nolock(lock); + } + if (&lock->l_lru == &ns->ns_unused_list) + break; + + LDLM_LOCK_GET(lock); + spin_unlock(&ns->ns_lock); + lu_ref_add(&lock->l_reference, __FUNCTION__, current); + + /* Pass the lock through the policy filter and see if it + * should stay in LRU. + * + * Even for shrinker policy we stop scanning if + * we find a lock that should stay in the cache. + * We should take into account lock age anyway + * as a new lock is a valuable resource even if + * it has a low weight. + * + * That is, for shrinker policy we drop only + * old locks, but additionally choose them by + * their weight. Big extent locks will stay in + * the cache. */ + result = pf(ns, lock, unused, added, count); + if (result == LDLM_POLICY_KEEP_LOCK) { + lu_ref_del(&lock->l_reference, + __FUNCTION__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + break; + } + if (result == LDLM_POLICY_SKIP_LOCK) { + lu_ref_del(&lock->l_reference, + __func__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + + lock_res_and_lock(lock); + /* Check flags again under the lock. */ + if ((lock->l_flags & LDLM_FL_CANCELING) || + (ldlm_lock_remove_from_lru(lock) == 0)) { + /* Another thread is removing lock from LRU, or + * somebody is already doing CANCEL, or there + * is a blocking request which will send cancel + * by itself, or the lock is no longer unused. */ + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, + __FUNCTION__, current); + LDLM_LOCK_RELEASE(lock); + spin_lock(&ns->ns_lock); + continue; + } + LASSERT(!lock->l_readers && !lock->l_writers); + + /* If we have chosen to cancel this lock voluntarily, we + * better send cancel notification to server, so that it + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also + * silently cancelling this lock. */ + lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; + + /* Setting the CBPENDING flag is a little misleading, + * but prevents an important race; namely, once + * CBPENDING is set, the lock can accumulate no more + * readers/writers. Since readers and writers are + * already zero here, ldlm_lock_decref() won't see + * this flag and call l_blocking_ast */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; + + /* We can't re-add to l_lru as it confuses the + * refcounting in ldlm_lock_remove_from_lru() if an AST + * arrives after we drop lr_lock below. We use l_bl_ast + * and can't use l_pending_chain as it is used both on + * server and client nevertheless bug 5666 says it is + * used only on server */ + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + unlock_res_and_lock(lock); + lu_ref_del(&lock->l_reference, __FUNCTION__, current); + spin_lock(&ns->ns_lock); + added++; + unused--; + } + spin_unlock(&ns->ns_lock); + RETURN(added); +} + +int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, + int count, int max, ldlm_cancel_flags_t cancel_flags, + int flags) +{ + int added; + added = ldlm_prepare_lru_list(ns, cancels, count, max, flags); + if (added <= 0) + return added; + return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); +} + +/** + * Cancel at least \a nr locks from given namespace LRU. + * + * When called with LCF_ASYNC the blocking callback will be handled + * in a thread and this function will return after the thread has been + * asked to call the callback. When called with LCF_ASYNC the blocking + * callback will be performed in this function. + */ +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, + ldlm_cancel_flags_t cancel_flags, + int flags) +{ + LIST_HEAD(cancels); + int count, rc; + ENTRY; + + /* Just prepare the list of locks, do not actually cancel them yet. + * Locks are cancelled later in a separate thread. */ + count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags); + rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); + if (rc == 0) + RETURN(count); + + RETURN(0); +} + +/** + * Find and cancel locally unused locks found on resource, matched to the + * given policy, mode. GET the found locks and add them into the \a cancels + * list. + */ +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, int lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque) +{ + struct ldlm_lock *lock; + int count = 0; + ENTRY; + + lock_res(res); + list_for_each_entry(lock, &res->lr_granted, l_res_link) { + if (opaque != NULL && lock->l_ast_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_ast_data, opaque); + //LBUG(); + continue; + } + + if (lock->l_readers || lock->l_writers) + continue; + + /* If somebody is already doing CANCEL, or blocking AST came, + * skip this lock. */ + if (lock->l_flags & LDLM_FL_BL_AST || + lock->l_flags & LDLM_FL_CANCELING) + continue; + + if (lockmode_compat(lock->l_granted_mode, mode)) + continue; + + /* If policy is given and this is IBITS lock, add to list only + * those locks that match by policy. */ + if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && + !(lock->l_policy_data.l_inodebits.bits & + policy->l_inodebits.bits)) + continue; + + /* See CBPENDING comment in ldlm_cancel_lru */ + lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | + lock_flags; + + LASSERT(list_empty(&lock->l_bl_ast)); + list_add(&lock->l_bl_ast, cancels); + LDLM_LOCK_GET(lock); + count++; + } + unlock_res(res); + + RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags)); +} +EXPORT_SYMBOL(ldlm_cancel_resource_local); + +/** + * Cancel client-side locks from a list and send/prepare cancel RPCs to the + * server. + * If \a req is NULL, send CANCEL request to server with handles of locks + * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests + * separately per lock. + * If \a req is not NULL, put handles of locks in \a cancels into the request + * buffer at the offset \a off. + * Destroy \a cancels at the end. + */ +int ldlm_cli_cancel_list(struct list_head *cancels, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags) +{ + struct ldlm_lock *lock; + int res = 0; + ENTRY; + + if (list_empty(cancels) || count == 0) + RETURN(0); + + /* XXX: requests (both batched and not) could be sent in parallel. + * Usually it is enough to have just 1 RPC, but it is possible that + * there are too many locks to be cancelled in LRU or on a resource. + * It would also speed up the case when the server does not support + * the feature. */ + while (count > 0) { + LASSERT(!list_empty(cancels)); + lock = list_entry(cancels->next, struct ldlm_lock, + l_bl_ast); + LASSERT(lock->l_conn_export); + + if (exp_connect_cancelset(lock->l_conn_export)) { + res = count; + if (req) + ldlm_cancel_pack(req, cancels, count); + else + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, count, + flags); + } else { + res = ldlm_cli_cancel_req(lock->l_conn_export, + cancels, 1, flags); + } + + if (res < 0) { + CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, + "ldlm_cli_cancel_list: %d\n", res); + res = count; + } + + count -= res; + ldlm_lock_list_put(cancels, l_bl_ast, res); + } + LASSERT(count == 0); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_list); + +/** + * Cancel all locks on a resource that have 0 readers/writers. + * + * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_resource *res; + LIST_HEAD(cancels); + int count; + int rc; + ENTRY; + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) { + /* This is not a problem. */ + CDEBUG(D_INFO, "No resource "LPU64"\n", res_id->name[0]); + RETURN(0); + } + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, &cancels, policy, mode, + 0, flags | LCF_BL_AST, opaque); + rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); + if (rc != ELDLM_OK) + CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc); + + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(0); +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); + +struct ldlm_cli_cancel_arg { + int lc_flags; + void *lc_opaque; +}; + +static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + struct ldlm_cli_cancel_arg *lc = arg; + int rc; + + rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, + NULL, LCK_MINMODE, + lc->lc_flags, lc->lc_opaque); + if (rc != 0) { + CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n", + res->lr_name.name[0], rc); + } + /* must return 0 for hash iteration */ + return 0; +} + +/** + * Cancel all locks on a namespace (or a specific resource, if given) + * that have 0 readers/writers. + * + * If flags & LCF_LOCAL, throw the locks away without trying + * to notify the server. */ +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_cancel_flags_t flags, void *opaque) +{ + struct ldlm_cli_cancel_arg arg = { + .lc_flags = flags, + .lc_opaque = opaque, + }; + + ENTRY; + + if (ns == NULL) + RETURN(ELDLM_OK); + + if (res_id != NULL) { + RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL, + LCK_MINMODE, flags, + opaque)); + } else { + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_cli_hash_cancel_unused, &arg); + RETURN(ELDLM_OK); + } +} +EXPORT_SYMBOL(ldlm_cli_cancel_unused); + +/* Lock iterators. */ + +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure) +{ + struct list_head *tmp, *next; + struct ldlm_lock *lock; + int rc = LDLM_ITER_CONTINUE; + + ENTRY; + + if (!res) + RETURN(LDLM_ITER_CONTINUE); + + lock_res(res); + list_for_each_safe(tmp, next, &res->lr_granted) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_converting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + + list_for_each_safe(tmp, next, &res->lr_waiting) { + lock = list_entry(tmp, struct ldlm_lock, l_res_link); + + if (iter(lock, closure) == LDLM_ITER_STOP) + GOTO(out, rc = LDLM_ITER_STOP); + } + out: + unlock_res(res); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_resource_foreach); + +struct iter_helper_data { + ldlm_iterator_t iter; + void *closure; +}; + +static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) +{ + struct iter_helper_data *helper = closure; + return helper->iter(lock, helper->closure); +} + +static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) + +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == + LDLM_ITER_STOP; +} + +void ldlm_namespace_foreach(struct ldlm_namespace *ns, + ldlm_iterator_t iter, void *closure) + +{ + struct iter_helper_data helper = { iter: iter, closure: closure }; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_iter_helper, &helper); + +} +EXPORT_SYMBOL(ldlm_namespace_foreach); + +/* non-blocking function to manipulate a lock whose cb_data is being put away. + * return 0: find no resource + * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. + * < 0: errors + */ +int ldlm_resource_iterate(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_iterator_t iter, void *data) +{ + struct ldlm_resource *res; + int rc; + ENTRY; + + if (ns == NULL) { + CERROR("must pass in namespace\n"); + LBUG(); + } + + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + rc = ldlm_resource_foreach(res, iter, data); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_resource_iterate); + +/* Lock replay */ + +static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) +{ + struct list_head *list = closure; + + /* we use l_pending_chain here, because it's unused on clients. */ + LASSERTF(list_empty(&lock->l_pending_chain), + "lock %p next %p prev %p\n", + lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev); + /* bug 9573: don't replay locks left after eviction, or + * bug 17614: locks being actively cancelled. Get a reference + * on a lock so that it does not disapear under us (e.g. due to cancel) + */ + if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) { + list_add(&lock->l_pending_chain, list); + LDLM_LOCK_GET(lock); + } + + return LDLM_ITER_CONTINUE; +} + +static int replay_lock_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct ldlm_async_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct ldlm_reply *reply; + struct obd_export *exp; + + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (rc != ELDLM_OK) + GOTO(out, rc); + + + reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + lock = ldlm_handle2lock(&aa->lock_handle); + if (!lock) { + CERROR("received replay ack for unknown local cookie "LPX64 + " remote cookie "LPX64 " from server %s id %s\n", + aa->lock_handle.cookie, reply->lock_handle.cookie, + req->rq_export->exp_client_uuid.uuid, + libcfs_id2str(req->rq_peer)); + GOTO(out, rc = -ESTALE); + } + + /* Key change rehash lock in per-export hash with new key */ + exp = req->rq_export; + if (exp && exp->exp_lock_hash) { + /* In the function below, .hs_keycmp resolves to + * ldlm_export_lock_keycmp() */ + /* coverity[overrun-buffer-val] */ + cfs_hash_rehash_key(exp->exp_lock_hash, + &lock->l_remote_handle, + &reply->lock_handle, + &lock->l_exp_hash); + } else { + lock->l_remote_handle = reply->lock_handle; + } + + LDLM_DEBUG(lock, "replayed lock:"); + ptlrpc_import_recovery_state_machine(req->rq_import); + LDLM_LOCK_PUT(lock); +out: + if (rc != ELDLM_OK) + ptlrpc_connect_import(req->rq_import); + + RETURN(rc); +} + +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + struct ldlm_async_args *aa; + struct ldlm_request *body; + int flags; + ENTRY; + + + /* Bug 11974: Do not replay a lock which is actively being canceled */ + if (lock->l_flags & LDLM_FL_CANCELING) { + LDLM_DEBUG(lock, "Not replaying canceled lock:"); + RETURN(0); + } + + /* If this is reply-less callback lock, we cannot replay it, since + * server might have long dropped it, but notification of that event was + * lost by network. (and server granted conflicting lock already) */ + if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) { + LDLM_DEBUG(lock, "Not replaying reply-less lock:"); + ldlm_lock_cancel(lock); + RETURN(0); + } + + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. + */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, + LUSTRE_DLM_VERSION, LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + /* We're part of recovery, so don't wait for it. */ + req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; + + body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); + ldlm_lock2desc(lock, &body->lock_desc); + body->lock_flags = ldlm_flags_to_wire(flags); + + ldlm_lock2handle(lock, &body->lock_handle[0]); + if (lock->l_lvb_len > 0) + req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + lock->l_lvb_len); + ptlrpc_request_set_replen(req); + /* notify the server we've replayed all requests. + * also, we mark the request to be put on a dedicated + * queue to be processed after all request replayes. + * bug 6063 */ + lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE); + + LDLM_DEBUG(lock, "replaying lock:"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->lock_handle = body->lock_handle[0]; + req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + RETURN(0); +} + +/** + * Cancel as many unused locks as possible before replay. since we are + * in recovery, we can't wait for any outstanding RPCs to send any RPC + * to the server. + * + * Called only in recovery before replaying locks. there is no need to + * replay locks that are unused. since the clients may hold thousands of + * cached unused locks, dropping the unused locks can greatly reduce the + * load on the servers at recovery time. + */ +static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns) +{ + int canceled; + LIST_HEAD(cancels); + + CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before" + "replay for namespace %s (%d)\n", + ldlm_ns_name(ns), ns->ns_nr_unused); + + /* We don't need to care whether or not LRU resize is enabled + * because the LDLM_CANCEL_NO_WAIT policy doesn't use the + * count parameter */ + canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0, + LCF_LOCAL, LDLM_CANCEL_NO_WAIT); + + CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n", + canceled, ldlm_ns_name(ns)); +} + +int ldlm_replay_locks(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + LIST_HEAD(list); + struct ldlm_lock *lock, *next; + int rc = 0; + + ENTRY; + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + + /* don't replay locks if import failed recovery */ + if (imp->imp_vbr_failed) + RETURN(0); + + /* ensure this doesn't fall to 0 before all have been queued */ + atomic_inc(&imp->imp_replay_inflight); + + if (ldlm_cancel_unused_locks_before_replay) + ldlm_cancel_unused_locks_for_replay(ns); + + ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list); + + list_for_each_entry_safe(lock, next, &list, l_pending_chain) { + list_del_init(&lock->l_pending_chain); + if (rc) { + LDLM_LOCK_RELEASE(lock); + continue; /* or try to do the rest? */ + } + rc = replay_one_lock(imp, lock); + LDLM_LOCK_RELEASE(lock); + } + + atomic_dec(&imp->imp_replay_inflight); + + RETURN(rc); +} +EXPORT_SYMBOL(ldlm_replay_locks); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c new file mode 100644 index 000000000000..6bdfb428a41a --- /dev/null +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c @@ -0,0 +1,1444 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ldlm/ldlm_resource.c + * + * Author: Phil Schwan + * Author: Peter Braam + */ + +#define DEBUG_SUBSYSTEM S_LDLM +# include + +#include +#include +#include "ldlm_internal.h" + +struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; + +atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0); +atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0); + +struct mutex ldlm_srv_namespace_lock; +LIST_HEAD(ldlm_srv_namespace_list); + +struct mutex ldlm_cli_namespace_lock; +LIST_HEAD(ldlm_cli_namespace_list); + +proc_dir_entry_t *ldlm_type_proc_dir = NULL; +proc_dir_entry_t *ldlm_ns_proc_dir = NULL; +proc_dir_entry_t *ldlm_svc_proc_dir = NULL; + +extern unsigned int ldlm_cancel_unused_locks_before_replay; + +/* during debug dump certain amount of granted locks for one resource to avoid + * DDOS. */ +unsigned int ldlm_dump_granted_max = 256; + +#ifdef LPROCFS +static int ldlm_proc_dump_ns(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); + ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); + RETURN(count); +} + +int ldlm_proc_setup(void) +{ + int rc; + struct lprocfs_vars list[] = { + { "dump_namespaces", NULL, ldlm_proc_dump_ns, NULL }, + { "dump_granted_max", + lprocfs_rd_uint, lprocfs_wr_uint, + &ldlm_dump_granted_max, NULL }, + { "cancel_unused_locks_before_replay", + lprocfs_rd_uint, lprocfs_wr_uint, + &ldlm_cancel_unused_locks_before_replay, NULL }, + { NULL }}; + ENTRY; + LASSERT(ldlm_ns_proc_dir == NULL); + + ldlm_type_proc_dir = lprocfs_register(OBD_LDLM_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(ldlm_type_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_type_proc_dir); + GOTO(err, rc); + } + + ldlm_ns_proc_dir = lprocfs_register("namespaces", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_ns_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_ns_proc_dir); + GOTO(err_type, rc); + } + + ldlm_svc_proc_dir = lprocfs_register("services", + ldlm_type_proc_dir, + NULL, NULL); + if (IS_ERR(ldlm_svc_proc_dir)) { + CERROR("LProcFS failed in ldlm-init\n"); + rc = PTR_ERR(ldlm_svc_proc_dir); + GOTO(err_ns, rc); + } + + rc = lprocfs_add_vars(ldlm_type_proc_dir, list, NULL); + + RETURN(0); + +err_ns: + lprocfs_remove(&ldlm_ns_proc_dir); +err_type: + lprocfs_remove(&ldlm_type_proc_dir); +err: + ldlm_svc_proc_dir = NULL; + RETURN(rc); +} + +void ldlm_proc_cleanup(void) +{ + if (ldlm_svc_proc_dir) + lprocfs_remove(&ldlm_svc_proc_dir); + + if (ldlm_ns_proc_dir) + lprocfs_remove(&ldlm_ns_proc_dir); + + if (ldlm_type_proc_dir) + lprocfs_remove(&ldlm_type_proc_dir); +} + +static int lprocfs_rd_ns_resources(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ldlm_namespace *ns = data; + __u64 res = 0; + cfs_hash_bd_t bd; + int i; + + /* result is not strictly consistant */ + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i) + res += cfs_hash_bd_count_get(&bd); + return lprocfs_rd_u64(page, start, off, count, eof, &res); +} + +static int lprocfs_rd_ns_locks(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ldlm_namespace *ns = data; + __u64 locks; + + locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_FIELDS_FLAGS_SUM); + return lprocfs_rd_u64(page, start, off, count, eof, &locks); +} + +static int lprocfs_rd_lru_size(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ldlm_namespace *ns = data; + __u32 *nr = &ns->ns_max_unused; + + if (ns_connect_lru_resize(ns)) + nr = &ns->ns_nr_unused; + return lprocfs_rd_uint(page, start, off, count, eof, nr); +} + +static int lprocfs_wr_lru_size(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ldlm_namespace *ns = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + int lru_resize; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + if (strncmp(dummy, "clear", 5) == 0) { + CDEBUG(D_DLMTRACE, + "dropping all unused locks from namespace %s\n", + ldlm_ns_name(ns)); + if (ns_connect_lru_resize(ns)) { + int canceled, unused = ns->ns_nr_unused; + + /* Try to cancel all @ns_nr_unused locks. */ + canceled = ldlm_cancel_lru(ns, unused, 0, + LDLM_CANCEL_PASSED); + if (canceled < unused) { + CDEBUG(D_DLMTRACE, + "not all requested locks are canceled, " + "requested: %d, canceled: %d\n", unused, + canceled); + return -EINVAL; + } + } else { + tmp = ns->ns_max_unused; + ns->ns_max_unused = 0; + ldlm_cancel_lru(ns, 0, 0, LDLM_CANCEL_PASSED); + ns->ns_max_unused = tmp; + } + return count; + } + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) { + CERROR("invalid value written\n"); + return -EINVAL; + } + lru_resize = (tmp == 0); + + if (ns_connect_lru_resize(ns)) { + if (!lru_resize) + ns->ns_max_unused = (unsigned int)tmp; + + if (tmp > ns->ns_nr_unused) + tmp = ns->ns_nr_unused; + tmp = ns->ns_nr_unused - tmp; + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", + ldlm_ns_name(ns), ns->ns_nr_unused, + (unsigned int)tmp); + ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_CANCEL_PASSED); + + if (!lru_resize) { + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; + } + } else { + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", + ldlm_ns_name(ns), ns->ns_max_unused, + (unsigned int)tmp); + ns->ns_max_unused = (unsigned int)tmp; + ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_CANCEL_PASSED); + + /* Make sure that LRU resize was originally supported before + * turning it on here. */ + if (lru_resize && + (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", + ldlm_ns_name(ns)); + ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; + } + } + + return count; +} + +static int lprocfs_rd_elc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ldlm_namespace *ns = data; + unsigned int supp = ns_connect_cancelset(ns); + + return lprocfs_rd_uint(page, start, off, count, eof, &supp); +} + +static int lprocfs_wr_elc(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ldlm_namespace *ns = data; + unsigned int supp = -1; + int rc; + + rc = lprocfs_wr_uint(file, buffer, count, &supp); + if (rc < 0) + return rc; + + if (supp == 0) + ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; + else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) + ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; + return count; +} + +void ldlm_namespace_proc_unregister(struct ldlm_namespace *ns) +{ + struct proc_dir_entry *dir; + + dir = lprocfs_srch(ldlm_ns_proc_dir, ldlm_ns_name(ns)); + if (dir == NULL) { + CERROR("dlm namespace %s has no procfs dir?\n", + ldlm_ns_name(ns)); + } else { + lprocfs_remove(&dir); + } + + if (ns->ns_stats != NULL) + lprocfs_free_stats(&ns->ns_stats); +} + +int ldlm_namespace_proc_register(struct ldlm_namespace *ns) +{ + struct lprocfs_vars lock_vars[2]; + char lock_name[MAX_STRING_SIZE + 1]; + + LASSERT(ns != NULL); + LASSERT(ns->ns_rs_hash != NULL); + + ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); + if (ns->ns_stats == NULL) + return -ENOMEM; + + lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, + LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + + lock_name[MAX_STRING_SIZE] = '\0'; + + memset(lock_vars, 0, sizeof(lock_vars)); + lock_vars[0].name = lock_name; + + snprintf(lock_name, MAX_STRING_SIZE, "%s/resource_count", + ldlm_ns_name(ns)); + lock_vars[0].data = ns; + lock_vars[0].read_fptr = lprocfs_rd_ns_resources; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_count", + ldlm_ns_name(ns)); + lock_vars[0].data = ns; + lock_vars[0].read_fptr = lprocfs_rd_ns_locks; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + if (ns_is_client(ns)) { + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_unused_count", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_nr_unused; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_size", + ldlm_ns_name(ns)); + lock_vars[0].data = ns; + lock_vars[0].read_fptr = lprocfs_rd_lru_size; + lock_vars[0].write_fptr = lprocfs_wr_lru_size; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_max_age; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/early_lock_cancel", + ldlm_ns_name(ns)); + lock_vars[0].data = ns; + lock_vars[0].read_fptr = lprocfs_rd_elc; + lock_vars[0].write_fptr = lprocfs_wr_elc; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + } else { + snprintf(lock_name, MAX_STRING_SIZE, "%s/ctime_age_limit", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_ctime_age_limit; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_timeouts", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_timeouts; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/max_nolock_bytes", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_max_nolock_size; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/contention_seconds", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_contention_time; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/contended_locks", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_contended_locks; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + + snprintf(lock_name, MAX_STRING_SIZE, "%s/max_parallel_ast", + ldlm_ns_name(ns)); + lock_vars[0].data = &ns->ns_max_parallel_ast; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lock_vars[0].write_fptr = lprocfs_wr_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + } + return 0; +} +#undef MAX_STRING_SIZE +#else /* LPROCFS */ + +#define ldlm_namespace_proc_unregister(ns) ({;}) +#define ldlm_namespace_proc_register(ns) ({0;}) + +#endif /* LPROCFS */ + +static unsigned ldlm_res_hop_hash(cfs_hash_t *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + unsigned val = 0; + unsigned i; + + for (i = 0; i < RES_NAME_SIZE; i++) + val += id->name[i]; + return val & mask; +} + +static unsigned ldlm_res_hop_fid_hash(cfs_hash_t *hs, + const void *key, unsigned mask) +{ + const struct ldlm_res_id *id = key; + struct lu_fid fid; + __u32 hash; + __u32 val; + + fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; + fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; + fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + hash = fid_flatten32(&fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { + val = id->name[LUSTRE_RES_ID_HSH_OFF]; + hash += (val >> 5) + (val << 11); + } else { + val = fid_oid(&fid); + } + hash = cfs_hash_long(hash, hs->hs_bkt_bits); + /* give me another random factor */ + hash -= cfs_hash_long((unsigned long)hs, val % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *ldlm_res_hop_key(struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return &res->lr_name; +} + +static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + return ldlm_res_eq((const struct ldlm_res_id *)key, + (const struct ldlm_res_id *)&res->lr_name); +} + +static void *ldlm_res_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ldlm_resource, lr_hash); +} + +static void ldlm_res_hop_get_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_getref(res); +} + +static void ldlm_res_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* cfs_hash_for_each_nolock is the only chance we call it */ + ldlm_resource_putref_locked(res); +} + +static void ldlm_res_hop_put(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ldlm_resource *res; + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + ldlm_resource_putref(res); +} + +cfs_hash_ops_t ldlm_ns_hash_ops = { + .hs_hash = ldlm_res_hop_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +cfs_hash_ops_t ldlm_ns_fid_hash_ops = { + .hs_hash = ldlm_res_hop_fid_hash, + .hs_key = ldlm_res_hop_key, + .hs_keycmp = ldlm_res_hop_keycmp, + .hs_keycpy = NULL, + .hs_object = ldlm_res_hop_object, + .hs_get = ldlm_res_hop_get_locked, + .hs_put_locked = ldlm_res_hop_put_locked, + .hs_put = ldlm_res_hop_put +}; + +typedef struct { + ldlm_ns_type_t nsd_type; + /** hash bucket bits */ + unsigned nsd_bkt_bits; + /** hash bits */ + unsigned nsd_all_bits; + /** hash operations */ + cfs_hash_ops_t *nsd_hops; +} ldlm_ns_hash_def_t; + +ldlm_ns_hash_def_t ldlm_ns_hash_defs[] = +{ + { + .nsd_type = LDLM_NS_TYPE_MDC, + .nsd_bkt_bits = 11, + .nsd_all_bits = 16, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MDT, + .nsd_bkt_bits = 14, + .nsd_all_bits = 21, + .nsd_hops = &ldlm_ns_fid_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OSC, + .nsd_bkt_bits = 8, + .nsd_all_bits = 12, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_OST, + .nsd_bkt_bits = 11, + .nsd_all_bits = 17, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGC, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_MGT, + .nsd_bkt_bits = 4, + .nsd_all_bits = 4, + .nsd_hops = &ldlm_ns_hash_ops, + }, + { + .nsd_type = LDLM_NS_TYPE_UNKNOWN, + }, +}; + +/** + * Create and initialize new empty namespace. + */ +struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, + ldlm_appetite_t apt, + ldlm_ns_type_t ns_type) +{ + struct ldlm_namespace *ns = NULL; + struct ldlm_ns_bucket *nsb; + ldlm_ns_hash_def_t *nsd; + cfs_hash_bd_t bd; + int idx; + int rc; + ENTRY; + + LASSERT(obd != NULL); + + rc = ldlm_get_ref(); + if (rc) { + CERROR("ldlm_get_ref failed: %d\n", rc); + RETURN(NULL); + } + + for (idx = 0;;idx++) { + nsd = &ldlm_ns_hash_defs[idx]; + if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { + CERROR("Unknown type %d for ns %s\n", ns_type, name); + GOTO(out_ref, NULL); + } + + if (nsd->nsd_type == ns_type) + break; + } + + OBD_ALLOC_PTR(ns); + if (!ns) + GOTO(out_ref, NULL); + + ns->ns_rs_hash = cfs_hash_create(name, + nsd->nsd_all_bits, nsd->nsd_all_bits, + nsd->nsd_bkt_bits, sizeof(*nsb), + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + nsd->nsd_hops, + CFS_HASH_DEPTH | + CFS_HASH_BIGNAME | + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF); + if (ns->ns_rs_hash == NULL) + GOTO(out_ns, NULL); + + cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { + nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); + nsb->nsb_namespace = ns; + } + + ns->ns_obd = obd; + ns->ns_appetite = apt; + ns->ns_client = client; + + INIT_LIST_HEAD(&ns->ns_list_chain); + INIT_LIST_HEAD(&ns->ns_unused_list); + spin_lock_init(&ns->ns_lock); + atomic_set(&ns->ns_bref, 0); + init_waitqueue_head(&ns->ns_waitq); + + ns->ns_max_nolock_size = NS_DEFAULT_MAX_NOLOCK_BYTES; + ns->ns_contention_time = NS_DEFAULT_CONTENTION_SECONDS; + ns->ns_contended_locks = NS_DEFAULT_CONTENDED_LOCKS; + + ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; + ns->ns_nr_unused = 0; + ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; + ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; + ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_timeouts = 0; + ns->ns_orig_connect_flags = 0; + ns->ns_connect_flags = 0; + ns->ns_stopping = 0; + rc = ldlm_namespace_proc_register(ns); + if (rc != 0) { + CERROR("Can't initialize ns proc, rc %d\n", rc); + GOTO(out_hash, rc); + } + + idx = atomic_read(ldlm_namespace_nr(client)); + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); + if (rc) { + CERROR("Can't initialize lock pool, rc %d\n", rc); + GOTO(out_proc, rc); + } + + ldlm_namespace_register(ns, client); + RETURN(ns); +out_proc: + ldlm_namespace_proc_unregister(ns); + ldlm_namespace_cleanup(ns, 0); +out_hash: + cfs_hash_putref(ns->ns_rs_hash); +out_ns: + OBD_FREE_PTR(ns); +out_ref: + ldlm_put_ref(); + RETURN(NULL); +} +EXPORT_SYMBOL(ldlm_namespace_new); + +extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); + +/** + * Cancel and destroy all locks on a resource. + * + * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just + * clean up. This is currently only used for recovery, and we make + * certain assumptions as a result--notably, that we shouldn't cancel + * locks with refs. + */ +static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, + __u64 flags) +{ + struct list_head *tmp; + int rc = 0, client = ns_is_client(ldlm_res_to_ns(res)); + bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); + + do { + struct ldlm_lock *lock = NULL; + + /* First, we look for non-cleaned-yet lock + * all cleaned locks are marked by CLEANED flag. */ + lock_res(res); + list_for_each(tmp, q) { + lock = list_entry(tmp, struct ldlm_lock, + l_res_link); + if (lock->l_flags & LDLM_FL_CLEANED) { + lock = NULL; + continue; + } + LDLM_LOCK_GET(lock); + lock->l_flags |= LDLM_FL_CLEANED; + break; + } + + if (lock == NULL) { + unlock_res(res); + break; + } + + /* Set CBPENDING so nothing in the cancellation path + * can match this lock. */ + lock->l_flags |= LDLM_FL_CBPENDING; + lock->l_flags |= LDLM_FL_FAILED; + lock->l_flags |= flags; + + /* ... without sending a CANCEL message for local_only. */ + if (local_only) + lock->l_flags |= LDLM_FL_LOCAL_ONLY; + + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... */ + unlock_res(res); + LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); + if (lock->l_completion_ast) + lock->l_completion_ast(lock, 0, NULL); + LDLM_LOCK_RELEASE(lock); + continue; + } + + if (client) { + struct lustre_handle lockh; + + unlock_res(res); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + ldlm_resource_unlink_lock(lock); + unlock_res(res); + LDLM_DEBUG(lock, "Freeing a lock still held by a " + "client node"); + ldlm_lock_destroy(lock); + } + LDLM_LOCK_RELEASE(lock); + } while (1); +} + +static int ldlm_resource_clean(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + __u64 flags = *(__u64 *)arg; + + cleanup_resource(res, &res->lr_granted, flags); + cleanup_resource(res, &res->lr_converting, flags); + cleanup_resource(res, &res->lr_waiting, flags); + + return 0; +} + +static int ldlm_resource_complain(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + + lock_res(res); + CERROR("Namespace %s resource refcount nonzero " + "(%d) after lock cleanup; forcing " + "cleanup.\n", + ldlm_ns_name(ldlm_res_to_ns(res)), + atomic_read(&res->lr_refcount) - 1); + + CERROR("Resource: %p ("LPU64"/"LPU64"/"LPU64"/" + LPU64") (rc: %d)\n", res, + res->lr_name.name[0], res->lr_name.name[1], + res->lr_name.name[2], res->lr_name.name[3], + atomic_read(&res->lr_refcount) - 1); + + ldlm_resource_dump(D_ERROR, res); + unlock_res(res); + return 0; +} + +/** + * Cancel and destroy all locks in the namespace. + * + * Typically used during evictions when server notified client that it was + * evicted and all of its state needs to be destroyed. + * Also used during shutdown. + */ +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) +{ + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, &flags); + cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, NULL); + return ELDLM_OK; +} +EXPORT_SYMBOL(ldlm_namespace_cleanup); + +/** + * Attempts to free namespace. + * + * Only used when namespace goes away, like during an unmount. + */ +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) +{ + ENTRY; + + /* At shutdown time, don't call the cancellation callback */ + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); + + if (atomic_read(&ns->ns_bref) > 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc; + CDEBUG(D_DLMTRACE, + "dlm namespace %s free waiting on refcount %d\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); +force_wait: + if (force) + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + + rc = l_wait_event(ns->ns_waitq, + atomic_read(&ns->ns_bref) == 0, &lwi); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... */ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + GOTO(force_wait, rc); + } + + if (atomic_read(&ns->ns_bref)) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ldlm_ns_name(ns), + atomic_read(&ns->ns_bref), rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } + CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", + ldlm_ns_name(ns)); + } + + RETURN(ELDLM_OK); +} + +/** + * Performs various cleanups for passed \a ns to make it drop refc and be + * ready for freeing. Waits for refc == 0. + * + * The following is done: + * (0) Unregister \a ns from its list to make inaccessible for potential + * users like pools thread and others; + * (1) Clear all locks in \a ns. + */ +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + int rc; + ENTRY; + if (!ns) { + EXIT; + return; + } + + spin_lock(&ns->ns_lock); + ns->ns_stopping = 1; + spin_unlock(&ns->ns_lock); + + /* + * Can fail with -EINTR when force == 0 in which case try harder. + */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* + * With all requests dropped and the import inactive + * we are gaurenteed all reference will be dropped. + */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} + +/** + * Performs freeing memory structures related to \a ns. This is only done + * when ldlm_namespce_free_prior() successfully removed all resources + * referencing \a ns and its refc == 0. + */ +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + /* Fini pool _before_ parent proc dir is removed. This is important as + * ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. */ + ldlm_pool_fini(&ns->ns_pool); + + ldlm_namespace_proc_unregister(ns); + cfs_hash_putref(ns->ns_rs_hash); + /* Namespace \a ns should be not on list at this time, otherwise + * this will cause issues related to using freed \a ns in poold + * thread. */ + LASSERT(list_empty(&ns->ns_list_chain)); + OBD_FREE_PTR(ns); + ldlm_put_ref(); + EXIT; +} + +/** + * Cleanup the resource, and free namespace. + * bug 12864: + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> + * -> lprocfs_remove(grab _lprocfs_lock). + * proc2: read proc info + * lprocfs_fops_read(grab _lprocfs_lock) -> + * -> osc_rd_active, etc(grab cl_sem). + * + * So that I have to split the ldlm_namespace_free into two parts - the first + * part ldlm_namespace_free_prior is used to cleanup the resource which is + * being used; the 2nd part ldlm_namespace_free_post is used to unregister the + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * held. + */ +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) +{ + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); +} +EXPORT_SYMBOL(ldlm_namespace_free); + +void ldlm_namespace_get(struct ldlm_namespace *ns) +{ + atomic_inc(&ns->ns_bref); +} +EXPORT_SYMBOL(ldlm_namespace_get); + +void ldlm_namespace_put(struct ldlm_namespace *ns) +{ + if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { + wake_up(&ns->ns_waitq); + spin_unlock(&ns->ns_lock); + } +} +EXPORT_SYMBOL(ldlm_namespace_put); + +/** Register \a ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_list(client)); + atomic_inc(ldlm_namespace_nr(client)); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Unregister \a ns from the list of namespaces. */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_lock(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* Some asserts and possibly other parts of the code are still + * using list_empty(&ns->ns_list_chain). This is why it is + * important to use list_del_init() here. */ + list_del_init(&ns->ns_list_chain); + atomic_dec(ldlm_namespace_nr(client)); + mutex_unlock(ldlm_namespace_lock(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client) +{ + LASSERT(!list_empty(&ns->ns_list_chain)); + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); +} + +/** Should be called with ldlm_namespace_lock(client) taken. */ +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client) +{ + LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); + LASSERT(!list_empty(ldlm_namespace_list(client))); + return container_of(ldlm_namespace_list(client)->next, + struct ldlm_namespace, ns_list_chain); +} + +/** Create and initialize new resource. */ +static struct ldlm_resource *ldlm_resource_new(void) +{ + struct ldlm_resource *res; + int idx; + + OBD_SLAB_ALLOC_PTR_GFP(res, ldlm_resource_slab, __GFP_IO); + if (res == NULL) + return NULL; + + INIT_LIST_HEAD(&res->lr_granted); + INIT_LIST_HEAD(&res->lr_converting); + INIT_LIST_HEAD(&res->lr_waiting); + + /* Initialize interval trees for each lock mode. */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + + atomic_set(&res->lr_refcount, 1); + spin_lock_init(&res->lr_lock); + lu_ref_init(&res->lr_reference); + + /* The creator of the resource must unlock the mutex after LVB + * initialization. */ + mutex_init(&res->lr_lvb_mutex); + mutex_lock(&res->lr_lvb_mutex); + + return res; +} + +/** + * Return a reference to resource with given name, creating it if necessary. + * Args: namespace with ns_lock unlocked + * Locks: takes and releases NS hash-lock and res->lr_lock + * Returns: referenced, unlocked ldlm_resource or NULL + */ +struct ldlm_resource * +ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, + const struct ldlm_res_id *name, ldlm_type_t type, int create) +{ + struct hlist_node *hnode; + struct ldlm_resource *res; + cfs_hash_bd_t bd; + __u64 version; + + LASSERT(ns != NULL); + LASSERT(parent == NULL); + LASSERT(ns->ns_rs_hash != NULL); + LASSERT(name->name[0] != 0); + + cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); + hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + if (hnode != NULL) { + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + + version = cfs_hash_bd_version_get(&bd); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); + + if (create == 0) + return NULL; + + LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, + "type: %d\n", type); + res = ldlm_resource_new(); + if (!res) + return NULL; + + res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); + res->lr_name = *name; + res->lr_type = type; + res->lr_most_restr = LCK_NL; + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : + cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); + + if (hnode != NULL) { + /* Someone won the race and already added the resource. */ + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* Clean lu_ref for failed resource. */ + lu_ref_fini(&res->lr_reference); + /* We have taken lr_lvb_mutex. Drop it. */ + mutex_unlock(&res->lr_lvb_mutex); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + + res = hlist_entry(hnode, struct ldlm_resource, lr_hash); + /* Synchronize with regard to resource creation. */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + mutex_lock(&res->lr_lvb_mutex); + mutex_unlock(&res->lr_lvb_mutex); + } + + if (unlikely(res->lr_lvb_len < 0)) { + ldlm_resource_putref(res); + res = NULL; + } + return res; + } + /* We won! Let's add the resource. */ + cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); + if (cfs_hash_bd_count_get(&bd) == 1) + ldlm_namespace_get(ns); + + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { + int rc; + + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); + rc = ns->ns_lvbo->lvbo_init(res); + if (rc < 0) { + CERROR("lvbo_init failed for resource " + LPU64": rc %d\n", name->name[0], rc); + if (res->lr_lvb_data) { + OBD_FREE(res->lr_lvb_data, res->lr_lvb_len); + res->lr_lvb_data = NULL; + } + res->lr_lvb_len = rc; + mutex_unlock(&res->lr_lvb_mutex); + ldlm_resource_putref(res); + return NULL; + } + } + + /* We create resource with locked lr_lvb_mutex. */ + mutex_unlock(&res->lr_lvb_mutex); + + return res; +} +EXPORT_SYMBOL(ldlm_resource_get); + +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) +{ + LASSERT(res != NULL); + LASSERT(res != LP_POISON); + atomic_inc(&res->lr_refcount); + CDEBUG(D_INFO, "getref res: %p count: %d\n", res, + atomic_read(&res->lr_refcount)); + return res; +} + +static void __ldlm_resource_putref_final(cfs_hash_bd_t *bd, + struct ldlm_resource *res) +{ + struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; + + if (!list_empty(&res->lr_granted)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_converting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + if (!list_empty(&res->lr_waiting)) { + ldlm_resource_dump(D_ERROR, res); + LBUG(); + } + + cfs_hash_bd_del_locked(nsb->nsb_namespace->ns_rs_hash, + bd, &res->lr_hash); + lu_ref_fini(&res->lr_reference); + if (cfs_hash_bd_count_get(bd) == 0) + ldlm_namespace_put(nsb->nsb_namespace); +} + +/* Returns 1 if the resource was freed, 0 if it remains. */ +int ldlm_resource_putref(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + cfs_hash_bd_t bd; + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); + if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) { + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + return 1; + } + return 0; +} +EXPORT_SYMBOL(ldlm_resource_putref); + +/* Returns 1 if the resource was freed, 0 if it remains. */ +int ldlm_resource_putref_locked(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "putref res: %p count: %d\n", + res, atomic_read(&res->lr_refcount) - 1); + + if (atomic_dec_and_test(&res->lr_refcount)) { + cfs_hash_bd_t bd; + + cfs_hash_bd_get(ldlm_res_to_ns(res)->ns_rs_hash, + &res->lr_name, &bd); + __ldlm_resource_putref_final(&bd, res); + cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); + /* NB: ns_rs_hash is created with CFS_HASH_NO_ITEMREF, + * so we should never be here while calling cfs_hash_del, + * cfs_hash_for_each_nolock is the only case we can get + * here, which is safe to release cfs_hash_bd_lock. + */ + if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) + ns->ns_lvbo->lvbo_free(res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); + + cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); + return 1; + } + return 0; +} + +/** + * Add a lock into a given resource into specified lock list. + */ +void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, + struct ldlm_lock *lock) +{ + check_res_locked(res); + + LDLM_DEBUG(lock, "About to add this lock:\n"); + + if (lock->l_destroyed) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + + LASSERT(list_empty(&lock->l_res_link)); + + list_add_tail(&lock->l_res_link, head); +} + +/** + * Insert a lock into resource after specified lock. + * + * Obtain resource description from the lock we are inserting after. + */ +void ldlm_resource_insert_lock_after(struct ldlm_lock *original, + struct ldlm_lock *new) +{ + struct ldlm_resource *res = original->l_resource; + + check_res_locked(res); + + ldlm_resource_dump(D_INFO, res); + LDLM_DEBUG(new, "About to insert this lock after %p:\n", original); + + if (new->l_destroyed) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + goto out; + } + + LASSERT(list_empty(&new->l_res_link)); + + list_add(&new->l_res_link, &original->l_res_link); + out:; +} + +void ldlm_resource_unlink_lock(struct ldlm_lock *lock) +{ + int type = lock->l_resource->lr_type; + + check_res_locked(lock->l_resource); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); + list_del_init(&lock->l_res_link); +} +EXPORT_SYMBOL(ldlm_resource_unlink_lock); + +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) +{ + desc->lr_type = res->lr_type; + desc->lr_name = res->lr_name; +} + +/** + * Print information about all locks in all namespaces on this node to debug + * log. + */ +void ldlm_dump_all_namespaces(ldlm_side_t client, int level) +{ + struct list_head *tmp; + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + mutex_lock(ldlm_namespace_lock(client)); + + list_for_each(tmp, ldlm_namespace_list(client)) { + struct ldlm_namespace *ns; + ns = list_entry(tmp, struct ldlm_namespace, ns_list_chain); + ldlm_namespace_dump(level, ns); + } + + mutex_unlock(ldlm_namespace_lock(client)); +} +EXPORT_SYMBOL(ldlm_dump_all_namespaces); + +static int ldlm_res_hash_dump(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *arg) +{ + struct ldlm_resource *res = cfs_hash_object(hs, hnode); + int level = (int)(unsigned long)arg; + + lock_res(res); + ldlm_resource_dump(level, res); + unlock_res(res); + + return 0; +} + +/** + * Print information about all locks in this namespace on this node to debug + * log. + */ +void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) +{ + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ldlm_ns_name(ns), atomic_read(&ns->ns_bref), + ns_is_client(ns) ? "client" : "server"); + + if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) + return; + + cfs_hash_for_each_nolock(ns->ns_rs_hash, + ldlm_res_hash_dump, + (void *)(unsigned long)level); + spin_lock(&ns->ns_lock); + ns->ns_next_dump = cfs_time_shift(10); + spin_unlock(&ns->ns_lock); +} +EXPORT_SYMBOL(ldlm_namespace_dump); + +/** + * Print information about all locks in this resource to debug log. + */ +void ldlm_resource_dump(int level, struct ldlm_resource *res) +{ + struct ldlm_lock *lock; + unsigned int granted = 0; + + CLASSERT(RES_NAME_SIZE == 4); + + if (!((libcfs_debug | D_ERROR) & level)) + return; + + CDEBUG(level, "--- Resource: %p ("LPU64"/"LPU64"/"LPU64"/"LPU64 + ") (rc: %d)\n", res, res->lr_name.name[0], res->lr_name.name[1], + res->lr_name.name[2], res->lr_name.name[3], + atomic_read(&res->lr_refcount)); + + if (!list_empty(&res->lr_granted)) { + CDEBUG(level, "Granted locks (in reverse order):\n"); + list_for_each_entry_reverse(lock, &res->lr_granted, + l_res_link) { + LDLM_DEBUG_LIMIT(level, lock, "###"); + if (!(level & D_CANTMASK) && + ++granted > ldlm_dump_granted_max) { + CDEBUG(level, "only dump %d granted locks to " + "avoid DDOS.\n", granted); + break; + } + } + } + if (!list_empty(&res->lr_converting)) { + CDEBUG(level, "Converting locks:\n"); + list_for_each_entry(lock, &res->lr_converting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } + if (!list_empty(&res->lr_waiting)) { + CDEBUG(level, "Waiting locks:\n"); + list_for_each_entry(lock, &res->lr_waiting, l_res_link) + LDLM_DEBUG_LIMIT(level, lock, "###"); + } +} diff --git a/drivers/staging/lustre/lustre/libcfs/Makefile b/drivers/staging/lustre/lustre/libcfs/Makefile new file mode 100644 index 000000000000..d64a3d0aec74 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/Makefile @@ -0,0 +1,22 @@ +obj-$(CONFIG_LUSTRE_FS) += libcfs.o + +libcfs-linux-objs := linux-tracefile.o linux-debug.o +libcfs-linux-objs += linux-prim.o linux-mem.o linux-cpu.o +libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o +libcfs-linux-objs += linux-proc.o linux-curproc.o +libcfs-linux-objs += linux-utils.o linux-module.o +libcfs-linux-objs += linux-crypto.o linux-crypto-crc32.o +libcfs-linux-objs += linux-crypto-adler.o +libcfs-linux-objs += linux-crypto-crc32pclmul.o + +libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs)) + +libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \ + watchdog.o libcfs_string.o hash.o kernel_user_comm.o \ + prng.o workitem.o upcall_cache.o libcfs_cpu.o \ + libcfs_mem.o libcfs_lock.o crc32-pclmul_asm.o + +libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) + +ccflags-y := -I$(src)/../include +ccflags-y += -I$(src)/ diff --git a/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S b/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S new file mode 100644 index 000000000000..cfaf13f275b3 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/crc32-pclmul_asm.S @@ -0,0 +1,360 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 + * calculation. + * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) + * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found + * at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2B: Instruction Set Reference, N-Z + * + * Authors: Gregory Prestas + * Alexander Boyko + */ + +/* gcc 4.1.2 does not support pclmulqdq instruction + * Use macro defenition from linux kernel 2.6.38 */ + +#define REG_NUM_INVALID 100 + .macro R32_NUM opd r32 + \opd = REG_NUM_INVALID + .ifc \r32,%eax + \opd = 0 + .endif + .ifc \r32,%ecx + \opd = 1 + .endif + .ifc \r32,%edx + \opd = 2 + .endif + .ifc \r32,%ebx + \opd = 3 + .endif + .ifc \r32,%esp + \opd = 4 + .endif + .ifc \r32,%ebp + \opd = 5 + .endif + .ifc \r32,%esi + \opd = 6 + .endif + .ifc \r32,%edi + \opd = 7 + .endif + .endm + + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + .endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro PEXTRD imm8 xmm1 reg1 + XMM_NUM extrd_opd2 \xmm1 + R32_NUM extrd_opd1 \reg1 + PFX_OPD_SIZE + PFX_REX extrd_opd1 extrd_opd2 + .byte 0x0f, 0x3a, 0x16 + MODRM 0xc0 extrd_opd1 extrd_opd2 + .byte \imm8 + .endm + +.align 16 +/* + * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 + * #define CONSTANT_R1 0x154442bd4LL + * + * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 + * #define CONSTANT_R2 0x1c6e41596LL + */ +.Lconstant_R2R1: + .octa 0x00000001c6e415960000000154442bd4 +/* + * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 + * #define CONSTANT_R3 0x1751997d0LL + * + * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e + * #define CONSTANT_R4 0x0ccaa009eLL + */ +.Lconstant_R4R3: + .octa 0x00000000ccaa009e00000001751997d0 +/* + * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 + * #define CONSTANT_R5 0x163cd6124LL + */ +.Lconstant_R5: + .octa 0x00000000000000000000000163cd6124 +.Lconstant_mask32: + .octa 0x000000000000000000000000FFFFFFFF +/* + * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL + * + * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL + * #define CONSTANT_RU 0x1F7011641LL + */ +.Lconstant_RUpoly: + .octa 0x00000001F701164100000001DB710641 + +#define CONSTANT %xmm0 + +#ifdef __x86_64__ +#define BUF %rdi +#define LEN %rsi +#define CRC %edx +#else +#define BUF %eax +#define LEN %edx +#define CRC %ecx +#endif + + + +.text +/** + * Calculate crc32 + * BUF - buffer (16 bytes aligned) + * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63 + * CRC - initial crc32 + * return %eax crc32 + * uint crc32_pclmul_le_16(unsigned char const *buffer, + * size_t len, uint crc32) + */ +.globl crc32_pclmul_le_16 +.align 4, 0x90 +crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */ + movdqa (BUF), %xmm1 + movdqa 0x10(BUF), %xmm2 + movdqa 0x20(BUF), %xmm3 + movdqa 0x30(BUF), %xmm4 + movd CRC, CONSTANT + pxor CONSTANT, %xmm1 + sub $0x40, LEN + add $0x40, BUF +#ifndef __x86_64__ + /* This is for position independed code(-fPIC) support for 32bit */ + call delta +delta: + pop %ecx +#endif + cmp $0x40, LEN + jb less_64 + +#ifdef __x86_64__ + movdqa .Lconstant_R2R1(%rip), CONSTANT +#else + movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT +#endif + +loop_64:/* 64 bytes Full cache line folding */ + prefetchnta 0x40(BUF) + movdqa %xmm1, %xmm5 + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 +#ifdef __x86_64__ + movdqa %xmm4, %xmm8 +#endif + PCLMULQDQ 00, CONSTANT, %xmm1 + PCLMULQDQ 00, CONSTANT, %xmm2 + PCLMULQDQ 00, CONSTANT, %xmm3 +#ifdef __x86_64__ + PCLMULQDQ 00, CONSTANT, %xmm4 +#endif + PCLMULQDQ 0x11, CONSTANT, %xmm5 + PCLMULQDQ 0x11, CONSTANT, %xmm6 + PCLMULQDQ 0x11, CONSTANT, %xmm7 +#ifdef __x86_64__ + PCLMULQDQ 0x11, CONSTANT, %xmm8 +#endif + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 +#ifdef __x86_64__ + pxor %xmm8, %xmm4 +#else + /* xmm8 unsupported for x32 */ + movdqa %xmm4, %xmm5 + PCLMULQDQ 00, CONSTANT, %xmm4 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm4 +#endif + + pxor (BUF), %xmm1 + pxor 0x10(BUF), %xmm2 + pxor 0x20(BUF), %xmm3 + pxor 0x30(BUF), %xmm4 + + sub $0x40, LEN + add $0x40, BUF + cmp $0x40, LEN + jge loop_64 +less_64:/* Folding cache line into 128bit */ +#ifdef __x86_64__ + movdqa .Lconstant_R4R3(%rip), CONSTANT +#else + movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT +#endif + prefetchnta (BUF) + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm2, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm3, %xmm1 + + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor %xmm4, %xmm1 + + cmp $0x10, LEN + jb fold_64 +loop_16:/* Folding rest buffer into 128bit */ + movdqa %xmm1, %xmm5 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + PCLMULQDQ 0x11, CONSTANT, %xmm5 + pxor %xmm5, %xmm1 + pxor (BUF), %xmm1 + sub $0x10, LEN + add $0x10, BUF + cmp $0x10, LEN + jge loop_16 + +fold_64: + /* perform the last 64 bit fold, also adds 32 zeroes + * to the input stream */ + PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + psrldq $0x08, %xmm1 + pxor CONSTANT, %xmm1 + + /* final 32-bit fold */ + movdqa %xmm1, %xmm2 +#ifdef __x86_64__ + movdqa .Lconstant_R5(%rip), CONSTANT + movdqa .Lconstant_mask32(%rip), %xmm3 +#else + movdqa .Lconstant_R5 - delta(%ecx), CONSTANT + movdqa .Lconstant_mask32 - delta(%ecx), %xmm3 +#endif + psrldq $0x04, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + + /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +#ifdef __x86_64__ + movdqa .Lconstant_RUpoly(%rip), CONSTANT +#else + movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT +#endif + movdqa %xmm1, %xmm2 + pand %xmm3, %xmm1 + PCLMULQDQ 0x10, CONSTANT, %xmm1 + pand %xmm3, %xmm1 + PCLMULQDQ 0x00, CONSTANT, %xmm1 + pxor %xmm2, %xmm1 + PEXTRD 0x01, %xmm1, %eax + + ret diff --git a/drivers/staging/lustre/lustre/libcfs/debug.c b/drivers/staging/lustre/lustre/libcfs/debug.c new file mode 100644 index 000000000000..5a87b0832074 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/debug.c @@ -0,0 +1,476 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/debug.c + * + * Author: Phil Schwan + * + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include "tracefile.h" + +static char debug_file_name[1024]; + +unsigned int libcfs_subsystem_debug = ~0; +CFS_MODULE_PARM(libcfs_subsystem_debug, "i", int, 0644, + "Lustre kernel debug subsystem mask"); +EXPORT_SYMBOL(libcfs_subsystem_debug); + +unsigned int libcfs_debug = (D_CANTMASK | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL); +CFS_MODULE_PARM(libcfs_debug, "i", int, 0644, + "Lustre kernel debug mask"); +EXPORT_SYMBOL(libcfs_debug); + +unsigned int libcfs_debug_mb = 0; +CFS_MODULE_PARM(libcfs_debug_mb, "i", uint, 0644, + "Total debug buffer size."); +EXPORT_SYMBOL(libcfs_debug_mb); + +unsigned int libcfs_printk = D_CANTMASK; +CFS_MODULE_PARM(libcfs_printk, "i", uint, 0644, + "Lustre kernel debug console mask"); +EXPORT_SYMBOL(libcfs_printk); + +unsigned int libcfs_console_ratelimit = 1; +CFS_MODULE_PARM(libcfs_console_ratelimit, "i", uint, 0644, + "Lustre kernel debug console ratelimit (0 to disable)"); +EXPORT_SYMBOL(libcfs_console_ratelimit); + +unsigned int libcfs_console_max_delay; +CFS_MODULE_PARM(libcfs_console_max_delay, "l", uint, 0644, + "Lustre kernel debug console max delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_max_delay); + +unsigned int libcfs_console_min_delay; +CFS_MODULE_PARM(libcfs_console_min_delay, "l", uint, 0644, + "Lustre kernel debug console min delay (jiffies)"); +EXPORT_SYMBOL(libcfs_console_min_delay); + +unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; +CFS_MODULE_PARM(libcfs_console_backoff, "i", uint, 0644, + "Lustre kernel debug console backoff factor"); +EXPORT_SYMBOL(libcfs_console_backoff); + +unsigned int libcfs_debug_binary = 1; +EXPORT_SYMBOL(libcfs_debug_binary); + +unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; +EXPORT_SYMBOL(libcfs_stack); + +unsigned int portal_enter_debugger; +EXPORT_SYMBOL(portal_enter_debugger); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +unsigned int libcfs_watchdog_ratelimit = 300; +EXPORT_SYMBOL(libcfs_watchdog_ratelimit); + +unsigned int libcfs_panic_on_lbug = 1; +CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644, + "Lustre kernel panic on LBUG"); +EXPORT_SYMBOL(libcfs_panic_on_lbug); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); + +static wait_queue_head_t debug_ctlwq; + +char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; + +/* We need to pass a pointer here, but elsewhere this must be a const */ +char *libcfs_debug_file_path; +CFS_MODULE_PARM(libcfs_debug_file_path, "s", charp, 0644, + "Path for dumping debug logs, " + "set 'NONE' to prevent log dumping"); + +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +const char * +libcfs_debug_subsys2str(int subsys) +{ + switch (1 << subsys) { + default: + return NULL; + case S_UNDEFINED: + return "undefined"; + case S_MDC: + return "mdc"; + case S_MDS: + return "mds"; + case S_OSC: + return "osc"; + case S_OST: + return "ost"; + case S_CLASS: + return "class"; + case S_LOG: + return "log"; + case S_LLITE: + return "llite"; + case S_RPC: + return "rpc"; + case S_LNET: + return "lnet"; + case S_LND: + return "lnd"; + case S_PINGER: + return "pinger"; + case S_FILTER: + return "filter"; + case S_ECHO: + return "echo"; + case S_LDLM: + return "ldlm"; + case S_LOV: + return "lov"; + case S_LQUOTA: + return "lquota"; + case S_OSD: + return "osd"; + case S_LMV: + return "lmv"; + case S_SEC: + return "sec"; + case S_GSS: + return "gss"; + case S_MGC: + return "mgc"; + case S_MGS: + return "mgs"; + case S_FID: + return "fid"; + case S_FLD: + return "fld"; + } +} + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +const char * +libcfs_debug_dbg2str(int debug) +{ + switch (1 << debug) { + default: + return NULL; + case D_TRACE: + return "trace"; + case D_INODE: + return "inode"; + case D_SUPER: + return "super"; + case D_EXT2: + return "ext2"; + case D_MALLOC: + return "malloc"; + case D_CACHE: + return "cache"; + case D_INFO: + return "info"; + case D_IOCTL: + return "ioctl"; + case D_NETERROR: + return "neterror"; + case D_NET: + return "net"; + case D_WARNING: + return "warning"; + case D_BUFFS: + return "buffs"; + case D_OTHER: + return "other"; + case D_DENTRY: + return "dentry"; + case D_NETTRACE: + return "nettrace"; + case D_PAGE: + return "page"; + case D_DLMTRACE: + return "dlmtrace"; + case D_ERROR: + return "error"; + case D_EMERG: + return "emerg"; + case D_HA: + return "ha"; + case D_RPCTRACE: + return "rpctrace"; + case D_VFSTRACE: + return "vfstrace"; + case D_READA: + return "reada"; + case D_MMAP: + return "mmap"; + case D_CONFIG: + return "config"; + case D_CONSOLE: + return "console"; + case D_QUOTA: + return "quota"; + case D_SEC: + return "sec"; + case D_LFSCK: + return "lfsck"; + } +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + if ((mask & (1 << i)) == 0) + continue; + + token = fn(i); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? */ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int m = 0; + int matched; + int n; + int t; + + /* Allow a number for backwards compatibility */ + + for (n = strlen(str); n > 0; n--) + if (!isspace(str[n-1])) + break; + matched = n; + + if ((t = sscanf(str, "%i%n", &m, &matched)) >= 1 && + matched == n) { + /* don't print warning for lctl set_param debug=0 or -1 */ + if (m != 0 && m != -1) + CWARN("You are trying to use a numerical value for the " + "mask - this will be deprecated in a future " + "release.\n"); + *mask = m; + return 0; + } + + return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, + 0xffffffff); +} + +/** + * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() + */ +void libcfs_debug_dumplog_internal(void *arg) +{ + DECL_JOURNAL_DATA; + + PUSH_JOURNAL; + + if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0) { + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%ld." LPLD, libcfs_debug_file_path_arr, + cfs_time_current_sec(), (long_ptr_t)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", + debug_file_name); + cfs_tracefile_dump_all_pages(debug_file_name); + libcfs_run_debug_log_upcall(debug_file_name); + } + POP_JOURNAL; +} + +int libcfs_debug_dumplog_thread(void *arg) +{ + libcfs_debug_dumplog_internal(arg); + wake_up(&debug_ctlwq); + return 0; +} + +void libcfs_debug_dumplog(void) +{ + wait_queue_t wait; + task_t *dumper; + ENTRY; + + /* we're being careful to ensure that the kernel thread is + * able to set our state to running as it exits before we + * get to schedule() */ + init_waitqueue_entry_current(&wait); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&debug_ctlwq, &wait); + + dumper = kthread_run(libcfs_debug_dumplog_thread, + (void *)(long)current_pid(), + "libcfs_debug_dumper"); + if (IS_ERR(dumper)) + printk(KERN_ERR "LustreError: cannot start log dump thread:" + " %ld\n", PTR_ERR(dumper)); + else + waitq_wait(&wait, TASK_INTERRUPTIBLE); + + /* be sure to teardown if cfs_create_thread() failed */ + remove_wait_queue(&debug_ctlwq, &wait); + set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL(libcfs_debug_dumplog); + +int libcfs_debug_init(unsigned long bufsize) +{ + int rc = 0; + unsigned int max = libcfs_debug_mb; + + init_waitqueue_head(&debug_ctlwq); + + if (libcfs_console_max_delay <= 0 || /* not set by user or */ + libcfs_console_min_delay <= 0 || /* set to invalid values */ + libcfs_console_min_delay >= libcfs_console_max_delay) { + libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; + libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; + } + + if (libcfs_debug_file_path != NULL) { + memset(libcfs_debug_file_path_arr, 0, PATH_MAX); + strncpy(libcfs_debug_file_path_arr, + libcfs_debug_file_path, PATH_MAX-1); + } + + /* If libcfs_debug_mb is set to an invalid value or uninitialized + * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES */ + if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { + max = TCD_MAX_PAGES; + } else { + max = (max / num_possible_cpus()); + max = (max << (20 - PAGE_CACHE_SHIFT)); + } + rc = cfs_tracefile_init(max); + + if (rc == 0) + libcfs_register_panic_notifier(); + + return rc; +} + +int libcfs_debug_cleanup(void) +{ + libcfs_unregister_panic_notifier(); + cfs_tracefile_exit(); + return 0; +} + +int libcfs_debug_clear_buffer(void) +{ + cfs_trace_flush_pages(); + return 0; +} + +/* Debug markers, although printed by S_LNET + * should not be be marked as such. */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int libcfs_debug_mark_buffer(const char *text) +{ + CDEBUG(D_TRACE,"***************************************************\n"); + LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); + CDEBUG(D_TRACE,"***************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_LNET + +void libcfs_debug_set_level(unsigned int debug_level) +{ + printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n", + debug_level); + libcfs_debug = debug_level; +} + +EXPORT_SYMBOL(libcfs_debug_set_level); + +long libcfs_log_return(struct libcfs_debug_msg_data *msgdata, long rc) +{ + libcfs_debug_msg(msgdata, "Process leaving (rc=%lu : %ld : %lx)\n", + rc, rc, rc); + return rc; +} +EXPORT_SYMBOL(libcfs_log_return); + +void libcfs_log_goto(struct libcfs_debug_msg_data *msgdata, const char *label, + long_ptr_t rc) +{ + libcfs_debug_msg(msgdata, "Process leaving via %s (rc=" LPLU " : " LPLD + " : " LPLX ")\n", label, (ulong_ptr_t)rc, rc, rc); +} +EXPORT_SYMBOL(libcfs_log_goto); diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c new file mode 100644 index 000000000000..c54448d69008 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/fail.c @@ -0,0 +1,137 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please contact Oracle Corporation, Inc., 500 Oracle Parkway, Redwood Shores, + * CA 94065 USA or visit www.oracle.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Oracle Corporation, Inc. + */ + +#include + +unsigned long cfs_fail_loc = 0; +unsigned int cfs_fail_val = 0; +wait_queue_head_t cfs_race_waitq; +int cfs_race_state; + +EXPORT_SYMBOL(cfs_fail_loc); +EXPORT_SYMBOL(cfs_fail_val); +EXPORT_SYMBOL(cfs_race_waitq); +EXPORT_SYMBOL(cfs_race_state); + +int __cfs_fail_check_set(__u32 id, __u32 value, int set) +{ + static atomic_t cfs_fail_count = ATOMIC_INIT(0); + + LASSERT(!(id & CFS_FAIL_ONCE)); + + if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == + (CFS_FAILED | CFS_FAIL_ONCE)) { + atomic_set(&cfs_fail_count, 0); /* paranoia */ + return 0; + } + + /* Fail 1/cfs_fail_val times */ + if (cfs_fail_loc & CFS_FAIL_RAND) { + if (cfs_fail_val < 2 || cfs_rand() % cfs_fail_val > 0) + return 0; + } + + /* Skip the first cfs_fail_val, then fail */ + if (cfs_fail_loc & CFS_FAIL_SKIP) { + if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) + return 0; + } + + /* check cfs_fail_val... */ + if (set == CFS_FAIL_LOC_VALUE) { + if (cfs_fail_val != -1 && cfs_fail_val != value) + return 0; + } + + /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ + if (cfs_fail_loc & CFS_FAIL_SOME && + (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { + int count = atomic_inc_return(&cfs_fail_count); + + if (count >= cfs_fail_val) { + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + atomic_set(&cfs_fail_count, 0); + /* we are lost race to increase */ + if (count > cfs_fail_val) + return 0; + } + } + + if ((set == CFS_FAIL_LOC_ORSET || set == CFS_FAIL_LOC_RESET) && + (value & CFS_FAIL_ONCE)) + set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); + /* Lost race to set CFS_FAILED_BIT. */ + if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { + /* If CFS_FAIL_ONCE is valid, only one process can fail, + * otherwise multi-process can fail at the same time. */ + if (cfs_fail_loc & CFS_FAIL_ONCE) + return 0; + } + + switch (set) { + case CFS_FAIL_LOC_NOSET: + case CFS_FAIL_LOC_VALUE: + break; + case CFS_FAIL_LOC_ORSET: + cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); + break; + case CFS_FAIL_LOC_RESET: + cfs_fail_loc = value; + break; + default: + LASSERTF(0, "called with bad set %u\n", set); + break; + } + + return 1; +} +EXPORT_SYMBOL(__cfs_fail_check_set); + +int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) +{ + int ret = 0; + + ret = __cfs_fail_check_set(id, value, set); + if (ret) { + CERROR("cfs_fail_timeout id %x sleeping for %dms\n", + id, ms); + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, + cfs_time_seconds(ms) / 1000); + set_current_state(TASK_RUNNING); + CERROR("cfs_fail_timeout id %x awake\n", id); + } + return ret; +} +EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c new file mode 100644 index 000000000000..231c678f76df --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@ -0,0 +1,2135 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/hash.c + * + * Implement a hash class for hash process in lustre system. + * + * Author: YuZhangyong + * + * 2008-08-15: Brian Behlendorf + * - Simplified API and improved documentation + * - Added per-hash feature flags: + * * CFS_HASH_DEBUG additional validation + * * CFS_HASH_REHASH dynamic rehashing + * - Added per-hash statistics + * - General performance enhancements + * + * 2009-07-31: Liang Zhen + * - move all stuff to libcfs + * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH + * - ignore hs_rwlock if without CFS_HASH_REHASH setting + * - buckets are allocated one by one(intead of contiguous memory), + * to avoid unnecessary cacheline conflict + * + * 2010-03-01: Liang Zhen + * - "bucket" is a group of hlist_head now, user can speicify bucket size + * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share + * one lock for reducing memory overhead. + * + * - support lockless hash, caller will take care of locks: + * avoid lock overhead for hash tables that are already protected + * by locking in the caller for another reason + * + * - support both spin_lock/rwlock for bucket: + * overhead of spinlock contention is lower than read/write + * contention of rwlock, so using spinlock to serialize operations on + * bucket is more reasonable for those frequently changed hash tables + * + * - support one-single lock mode: + * one lock to protect all hash operations to avoid overhead of + * multiple locks if hash table is always small + * + * - removed a lot of unnecessary addref & decref on hash element: + * addref & decref are atomic operations in many use-cases which + * are expensive. + * + * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): + * some lustre use-cases require these functions to be strictly + * non-blocking, we need to schedule required rehash on a different + * thread on those cases. + * + * - safer rehash on large hash table + * In old implementation, rehash function will exclusively lock the + * hash table and finish rehash in one batch, it's dangerous on SMP + * system because rehash millions of elements could take long time. + * New implemented rehash can release lock and relax CPU in middle + * of rehash, it's safe for another thread to search/change on the + * hash table even it's in rehasing. + * + * - support two different refcount modes + * . hash table has refcount on element + * . hash table doesn't change refcount on adding/removing element + * + * - support long name hash table (for param-tree) + * + * - fix a bug for cfs_hash_rehash_key: + * in old implementation, cfs_hash_rehash_key could screw up the + * hash-table because @key is overwritten without any protection. + * Now we need user to define hs_keycpy for those rehash enabled + * hash tables, cfs_hash_rehash_key will overwrite hash-key + * inside lock by calling hs_keycpy. + * + * - better hash iteration: + * Now we support both locked iteration & lockless iteration of hash + * table. Also, user can break the iteration by return 1 in callback. + */ + +#include + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static unsigned int warn_on_depth = 8; +CFS_MODULE_PARM(warn_on_depth, "i", uint, 0644, + "warning when hash depth is high."); +#endif + +struct cfs_wi_sched *cfs_sched_rehash; + +static inline void +cfs_hash_nl_lock(cfs_hash_lock_t *lock, int exclusive) {} + +static inline void +cfs_hash_nl_unlock(cfs_hash_lock_t *lock, int exclusive) {} + +static inline void +cfs_hash_spin_lock(cfs_hash_lock_t *lock, int exclusive) +{ + spin_lock(&lock->spin); +} + +static inline void +cfs_hash_spin_unlock(cfs_hash_lock_t *lock, int exclusive) +{ + spin_unlock(&lock->spin); +} + +static inline void +cfs_hash_rw_lock(cfs_hash_lock_t *lock, int exclusive) +{ + if (!exclusive) + read_lock(&lock->rw); + else + write_lock(&lock->rw); +} + +static inline void +cfs_hash_rw_unlock(cfs_hash_lock_t *lock, int exclusive) +{ + if (!exclusive) + read_unlock(&lock->rw); + else + write_unlock(&lock->rw); +} + +/** No lock hash */ +static cfs_hash_lock_ops_t cfs_hash_nl_lops = +{ + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** no bucket lock, one spinlock to protect everything */ +static cfs_hash_lock_ops_t cfs_hash_nbl_lops = +{ + .hs_lock = cfs_hash_spin_lock, + .hs_unlock = cfs_hash_spin_unlock, + .hs_bkt_lock = cfs_hash_nl_lock, + .hs_bkt_unlock = cfs_hash_nl_unlock, +}; + +/** spin bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_spin_lops = +{ + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is enabled */ +static cfs_hash_lock_ops_t cfs_hash_bkt_rw_lops = +{ + .hs_lock = cfs_hash_rw_lock, + .hs_unlock = cfs_hash_rw_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +/** spin bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_spin_lops = +{ + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_spin_lock, + .hs_bkt_unlock = cfs_hash_spin_unlock, +}; + +/** rw bucket lock, rehash is disabled */ +static cfs_hash_lock_ops_t cfs_hash_nr_bkt_rw_lops = +{ + .hs_lock = cfs_hash_nl_lock, + .hs_unlock = cfs_hash_nl_unlock, + .hs_bkt_lock = cfs_hash_rw_lock, + .hs_bkt_unlock = cfs_hash_rw_unlock, +}; + +static void +cfs_hash_lock_setup(cfs_hash_t *hs) +{ + if (cfs_hash_with_no_lock(hs)) { + hs->hs_lops = &cfs_hash_nl_lops; + + } else if (cfs_hash_with_no_bktlock(hs)) { + hs->hs_lops = &cfs_hash_nbl_lops; + spin_lock_init(&hs->hs_lock.spin); + + } else if (cfs_hash_with_rehash(hs)) { + rwlock_init(&hs->hs_lock.rw); + + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_bkt_spin_lops; + else + LBUG(); + } else { + if (cfs_hash_with_rw_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; + else if (cfs_hash_with_spin_bktlock(hs)) + hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; + else + LBUG(); + } +} + +/** + * Simple hash head without depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hh_head; /**< entries list */ +} cfs_hash_head_t; + +static int +cfs_hash_hh_hhead_size(cfs_hash_t *hs) +{ + return sizeof(cfs_hash_head_t); +} + +static struct hlist_head * +cfs_hash_hh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + cfs_hash_head_t *head = (cfs_hash_head_t *)&bd->bd_bucket->hsb_head[0]; + + return &head[bd->bd_offset].hh_head; +} + +static int +cfs_hash_hh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); + return -1; /* unknown depth */ +} + +static int +cfs_hash_hh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + hlist_del_init(hnode); + return -1; /* unknown depth */ +} + +/** + * Simple hash head with depth tracking + * new element is always added to head of hlist + */ +typedef struct { + struct hlist_head hd_head; /**< entries list */ + unsigned int hd_depth; /**< list length */ +} cfs_hash_head_dep_t; + +static int +cfs_hash_hd_hhead_size(cfs_hash_t *hs) +{ + return sizeof(cfs_hash_head_dep_t); +} + +static struct hlist_head * +cfs_hash_hd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + cfs_hash_head_dep_t *head; + + head = (cfs_hash_head_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].hd_head; +} + +static int +cfs_hash_hd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_add_head(hnode, &hh->hd_head); + return ++hh->hd_depth; +} + +static int +cfs_hash_hd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + cfs_hash_head_dep_t *hh = container_of(cfs_hash_hd_hhead(hs, bd), + cfs_hash_head_dep_t, hd_head); + hlist_del_init(hnode); + return --hh->hd_depth; +} + +/** + * double links hash head without depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dh_head; /**< entries list */ + struct hlist_node *dh_tail; /**< the last entry */ +} cfs_hash_dhead_t; + +static int +cfs_hash_dh_hhead_size(cfs_hash_t *hs) +{ + return sizeof(cfs_hash_dhead_t); +} + +static struct hlist_head * +cfs_hash_dh_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + cfs_hash_dhead_t *head; + + head = (cfs_hash_dhead_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dh_head; +} + +static int +cfs_hash_dh_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (dh->dh_tail != NULL) /* not empty */ + hlist_add_after(dh->dh_tail, hnode); + else /* empty list */ + hlist_add_head(hnode, &dh->dh_head); + dh->dh_tail = hnode; + return -1; /* unknown depth */ +} + +static int +cfs_hash_dh_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_t *dh = container_of(cfs_hash_dh_hhead(hs, bd), + cfs_hash_dhead_t, dh_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return -1; /* unknown depth */ +} + +/** + * double links hash head with depth tracking + * new element is always added to tail of hlist + */ +typedef struct { + struct hlist_head dd_head; /**< entries list */ + struct hlist_node *dd_tail; /**< the last entry */ + unsigned int dd_depth; /**< list length */ +} cfs_hash_dhead_dep_t; + +static int +cfs_hash_dd_hhead_size(cfs_hash_t *hs) +{ + return sizeof(cfs_hash_dhead_dep_t); +} + +static struct hlist_head * +cfs_hash_dd_hhead(cfs_hash_t *hs, cfs_hash_bd_t *bd) +{ + cfs_hash_dhead_dep_t *head; + + head = (cfs_hash_dhead_dep_t *)&bd->bd_bucket->hsb_head[0]; + return &head[bd->bd_offset].dd_head; +} + +static int +cfs_hash_dd_hnode_add(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (dh->dd_tail != NULL) /* not empty */ + hlist_add_after(dh->dd_tail, hnode); + else /* empty list */ + hlist_add_head(hnode, &dh->dd_head); + dh->dd_tail = hnode; + return ++dh->dd_depth; +} + +static int +cfs_hash_dd_hnode_del(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnd) +{ + cfs_hash_dhead_dep_t *dh = container_of(cfs_hash_dd_hhead(hs, bd), + cfs_hash_dhead_dep_t, dd_head); + + if (hnd->next == NULL) { /* it's the tail */ + dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : + container_of(hnd->pprev, struct hlist_node, next); + } + hlist_del_init(hnd); + return --dh->dd_depth; +} + +static cfs_hash_hlist_ops_t cfs_hash_hh_hops = { + .hop_hhead = cfs_hash_hh_hhead, + .hop_hhead_size = cfs_hash_hh_hhead_size, + .hop_hnode_add = cfs_hash_hh_hnode_add, + .hop_hnode_del = cfs_hash_hh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_hd_hops = { + .hop_hhead = cfs_hash_hd_hhead, + .hop_hhead_size = cfs_hash_hd_hhead_size, + .hop_hnode_add = cfs_hash_hd_hnode_add, + .hop_hnode_del = cfs_hash_hd_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dh_hops = { + .hop_hhead = cfs_hash_dh_hhead, + .hop_hhead_size = cfs_hash_dh_hhead_size, + .hop_hnode_add = cfs_hash_dh_hnode_add, + .hop_hnode_del = cfs_hash_dh_hnode_del, +}; + +static cfs_hash_hlist_ops_t cfs_hash_dd_hops = { + .hop_hhead = cfs_hash_dd_hhead, + .hop_hhead_size = cfs_hash_dd_hhead_size, + .hop_hnode_add = cfs_hash_dd_hnode_add, + .hop_hnode_del = cfs_hash_dd_hnode_del, +}; + +static void +cfs_hash_hlist_setup(cfs_hash_t *hs) +{ + if (cfs_hash_with_add_tail(hs)) { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_dd_hops : &cfs_hash_dh_hops; + } else { + hs->hs_hops = cfs_hash_with_depth(hs) ? + &cfs_hash_hd_hops : &cfs_hash_hh_hops; + } +} + +static void +cfs_hash_bd_from_key(cfs_hash_t *hs, cfs_hash_bucket_t **bkts, + unsigned int bits, const void *key, cfs_hash_bd_t *bd) +{ + unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); + + LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); + + bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; + bd->bd_offset = index >> (bits - hs->hs_bkt_bits); +} + +void +cfs_hash_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bd) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (likely(hs->hs_rehash_buckets == NULL)) { + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, bd); + } else { + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, bd); + } +} +EXPORT_SYMBOL(cfs_hash_bd_get); + +static inline void +cfs_hash_bd_dep_record(cfs_hash_t *hs, cfs_hash_bd_t *bd, int dep_cur) +{ + if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) + return; + + bd->bd_bucket->hsb_depmax = dep_cur; +# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 + if (likely(warn_on_depth == 0 || + max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) + return; + + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_max = dep_cur; + hs->hs_dep_bkt = bd->bd_bucket->hsb_index; + hs->hs_dep_off = bd->bd_offset; + hs->hs_dep_bits = hs->hs_cur_bits; + spin_unlock(&hs->hs_dep_lock); + + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_dep_wi); +# endif +} + +void +cfs_hash_bd_add_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + int rc; + + rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); + cfs_hash_bd_dep_record(hs, bd, rc); + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + bd->bd_bucket->hsb_count++; + + if (cfs_hash_with_counter(hs)) + atomic_inc(&hs->hs_count); + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_get(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_add_locked); + +void +cfs_hash_bd_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode) +{ + hs->hs_hops->hop_hnode_del(hs, bd, hnode); + + LASSERT(bd->bd_bucket->hsb_count > 0); + bd->bd_bucket->hsb_count--; + bd->bd_bucket->hsb_version++; + if (unlikely(bd->bd_bucket->hsb_version == 0)) + bd->bd_bucket->hsb_version++; + + if (cfs_hash_with_counter(hs)) { + LASSERT(atomic_read(&hs->hs_count) > 0); + atomic_dec(&hs->hs_count); + } + if (!cfs_hash_with_no_itemref(hs)) + cfs_hash_put_locked(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_bd_del_locked); + +void +cfs_hash_bd_move_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd_old, + cfs_hash_bd_t *bd_new, struct hlist_node *hnode) +{ + cfs_hash_bucket_t *obkt = bd_old->bd_bucket; + cfs_hash_bucket_t *nbkt = bd_new->bd_bucket; + int rc; + + if (cfs_hash_bd_compare(bd_old, bd_new) == 0) + return; + + /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops + * in cfs_hash_bd_del/add_locked */ + hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); + rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); + cfs_hash_bd_dep_record(hs, bd_new, rc); + + LASSERT(obkt->hsb_count > 0); + obkt->hsb_count--; + obkt->hsb_version++; + if (unlikely(obkt->hsb_version == 0)) + obkt->hsb_version++; + nbkt->hsb_count++; + nbkt->hsb_version++; + if (unlikely(nbkt->hsb_version == 0)) + nbkt->hsb_version++; +} +EXPORT_SYMBOL(cfs_hash_bd_move_locked); + +enum { + /** always set, for sanity (avoid ZERO intent) */ + CFS_HS_LOOKUP_MASK_FIND = 1 << 0, + /** return entry with a ref */ + CFS_HS_LOOKUP_MASK_REF = 1 << 1, + /** add entry if not existing */ + CFS_HS_LOOKUP_MASK_ADD = 1 << 2, + /** delete entry, ignore other masks */ + CFS_HS_LOOKUP_MASK_DEL = 1 << 3, +}; + +typedef enum cfs_hash_lookup_intent { + /** return item w/o refcount */ + CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, + /** return item with refcount */ + CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_REF), + /** return item w/o refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** return item with refcount if existed, otherwise add */ + CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | + CFS_HS_LOOKUP_MASK_ADD), + /** delete if existed */ + CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | + CFS_HS_LOOKUP_MASK_DEL) +} cfs_hash_lookup_intent_t; + +static struct hlist_node * +cfs_hash_bd_lookup_intent(cfs_hash_t *hs, cfs_hash_bd_t *bd, + const void *key, struct hlist_node *hnode, + cfs_hash_lookup_intent_t intent) + +{ + struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); + struct hlist_node *ehnode; + struct hlist_node *match; + int intent_add = (intent & CFS_HS_LOOKUP_MASK_ADD) != 0; + + /* with this function, we can avoid a lot of useless refcount ops, + * which are expensive atomic operations most time. */ + match = intent_add ? NULL : hnode; + hlist_for_each(ehnode, hhead) { + if (!cfs_hash_keycmp(hs, key, ehnode)) + continue; + + if (match != NULL && match != ehnode) /* can't match */ + continue; + + /* match and ... */ + if ((intent & CFS_HS_LOOKUP_MASK_DEL) != 0) { + cfs_hash_bd_del_locked(hs, bd, ehnode); + return ehnode; + } + + /* caller wants refcount? */ + if ((intent & CFS_HS_LOOKUP_MASK_REF) != 0) + cfs_hash_get(hs, ehnode); + return ehnode; + } + /* no match item */ + if (!intent_add) + return NULL; + + LASSERT(hnode != NULL); + cfs_hash_bd_add_locked(hs, bd, hnode); + return hnode; +} + +struct hlist_node * +cfs_hash_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_FIND); +} +EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); + +struct hlist_node * +cfs_hash_bd_peek_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, const void *key) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, + CFS_HS_LOOKUP_IT_PEEK); +} +EXPORT_SYMBOL(cfs_hash_bd_peek_locked); + +struct hlist_node * +cfs_hash_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_ADD | + (!noref * CFS_HS_LOOKUP_MASK_REF)); +} +EXPORT_SYMBOL(cfs_hash_bd_findadd_locked); + +struct hlist_node * +cfs_hash_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + const void *key, struct hlist_node *hnode) +{ + /* hnode can be NULL, we find the first item with @key */ + return cfs_hash_bd_lookup_intent(hs, bd, key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); +} +EXPORT_SYMBOL(cfs_hash_bd_finddel_locked); + +static void +cfs_hash_multi_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, + unsigned n, int excl) +{ + cfs_hash_bucket_t *prev = NULL; + int i; + + /** + * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. + * NB: it's possible that several bds point to the same bucket but + * have different bd::bd_offset, so need take care of deadlock. + */ + cfs_hash_for_each_bd(bds, n, i) { + if (prev == bds[i].bd_bucket) + continue; + + LASSERT(prev == NULL || + prev->hsb_index < bds[i].bd_bucket->hsb_index); + cfs_hash_bd_lock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } +} + +static void +cfs_hash_multi_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, + unsigned n, int excl) +{ + cfs_hash_bucket_t *prev = NULL; + int i; + + cfs_hash_for_each_bd(bds, n, i) { + if (prev != bds[i].bd_bucket) { + cfs_hash_bd_unlock(hs, &bds[i], excl); + prev = bds[i].bd_bucket; + } + } +} + +static struct hlist_node * +cfs_hash_multi_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds, + unsigned n, const void *key) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, + CFS_HS_LOOKUP_IT_FIND); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static struct hlist_node * +cfs_hash_multi_bd_findadd_locked(cfs_hash_t *hs, + cfs_hash_bd_t *bds, unsigned n, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + int intent; + unsigned i; + + LASSERT(hnode != NULL); + intent = CFS_HS_LOOKUP_IT_PEEK | (!noref * CFS_HS_LOOKUP_MASK_REF); + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, + NULL, intent); + if (ehnode != NULL) + return ehnode; + } + + if (i == 1) { /* only one bucket */ + cfs_hash_bd_add_locked(hs, &bds[0], hnode); + } else { + cfs_hash_bd_t mybd; + + cfs_hash_bd_get(hs, key, &mybd); + cfs_hash_bd_add_locked(hs, &mybd, hnode); + } + + return hnode; +} + +static struct hlist_node * +cfs_hash_multi_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds, + unsigned n, const void *key, + struct hlist_node *hnode) +{ + struct hlist_node *ehnode; + unsigned i; + + cfs_hash_for_each_bd(bds, n, i) { + ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, + CFS_HS_LOOKUP_IT_FINDDEL); + if (ehnode != NULL) + return ehnode; + } + return NULL; +} + +static void +cfs_hash_bd_order(cfs_hash_bd_t *bd1, cfs_hash_bd_t *bd2) +{ + int rc; + + if (bd2->bd_bucket == NULL) + return; + + if (bd1->bd_bucket == NULL) { + *bd1 = *bd2; + bd2->bd_bucket = NULL; + return; + } + + rc = cfs_hash_bd_compare(bd1, bd2); + if (rc == 0) { + bd2->bd_bucket = NULL; + + } else if (rc > 0) { /* swab bd1 and bd2 */ + cfs_hash_bd_t tmp; + + tmp = *bd2; + *bd2 = *bd1; + *bd1 = tmp; + } +} + +void +cfs_hash_dual_bd_get(cfs_hash_t *hs, const void *key, cfs_hash_bd_t *bds) +{ + /* NB: caller should hold hs_lock.rw if REHASH is set */ + cfs_hash_bd_from_key(hs, hs->hs_buckets, + hs->hs_cur_bits, key, &bds[0]); + if (likely(hs->hs_rehash_buckets == NULL)) { + /* no rehash or not rehashing */ + bds[1].bd_bucket = NULL; + return; + } + + LASSERT(hs->hs_rehash_bits != 0); + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &bds[1]); + + cfs_hash_bd_order(&bds[0], &bds[1]); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_get); + +void +cfs_hash_dual_bd_lock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl) +{ + cfs_hash_multi_bd_lock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lock); + +void +cfs_hash_dual_bd_unlock(cfs_hash_t *hs, cfs_hash_bd_t *bds, int excl) +{ + cfs_hash_multi_bd_unlock(hs, bds, 2, excl); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_unlock); + +struct hlist_node * +cfs_hash_dual_bd_lookup_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds, + const void *key) +{ + return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_lookup_locked); + +struct hlist_node * +cfs_hash_dual_bd_findadd_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds, + const void *key, struct hlist_node *hnode, + int noref) +{ + return cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key, + hnode, noref); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_findadd_locked); + +struct hlist_node * +cfs_hash_dual_bd_finddel_locked(cfs_hash_t *hs, cfs_hash_bd_t *bds, + const void *key, struct hlist_node *hnode) +{ + return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode); +} +EXPORT_SYMBOL(cfs_hash_dual_bd_finddel_locked); + +static void +cfs_hash_buckets_free(cfs_hash_bucket_t **buckets, + int bkt_size, int prev_size, int size) +{ + int i; + + for (i = prev_size; i < size; i++) { + if (buckets[i] != NULL) + LIBCFS_FREE(buckets[i], bkt_size); + } + + LIBCFS_FREE(buckets, sizeof(buckets[0]) * size); +} + +/* + * Create or grow bucket memory. Return old_buckets if no allocation was + * needed, the newly allocated buckets if allocation was needed and + * successful, and NULL on error. + */ +static cfs_hash_bucket_t ** +cfs_hash_buckets_realloc(cfs_hash_t *hs, cfs_hash_bucket_t **old_bkts, + unsigned int old_size, unsigned int new_size) +{ + cfs_hash_bucket_t **new_bkts; + int i; + + LASSERT(old_size == 0 || old_bkts != NULL); + + if (old_bkts != NULL && old_size == new_size) + return old_bkts; + + LIBCFS_ALLOC(new_bkts, sizeof(new_bkts[0]) * new_size); + if (new_bkts == NULL) + return NULL; + + if (old_bkts != NULL) { + memcpy(new_bkts, old_bkts, + min(old_size, new_size) * sizeof(*old_bkts)); + } + + for (i = old_size; i < new_size; i++) { + struct hlist_head *hhead; + cfs_hash_bd_t bd; + + LIBCFS_ALLOC(new_bkts[i], cfs_hash_bkt_size(hs)); + if (new_bkts[i] == NULL) { + cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs), + old_size, new_size); + return NULL; + } + + new_bkts[i]->hsb_index = i; + new_bkts[i]->hsb_version = 1; /* shouldn't be zero */ + new_bkts[i]->hsb_depmax = -1; /* unknown */ + bd.bd_bucket = new_bkts[i]; + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) + INIT_HLIST_HEAD(hhead); + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_no_bktlock(hs)) + continue; + + if (cfs_hash_with_rw_bktlock(hs)) + rwlock_init(&new_bkts[i]->hsb_lock.rw); + else if (cfs_hash_with_spin_bktlock(hs)) + spin_lock_init(&new_bkts[i]->hsb_lock.spin); + else + LBUG(); /* invalid use-case */ + } + return new_bkts; +} + +/** + * Initialize new libcfs hash, where: + * @name - Descriptive hash name + * @cur_bits - Initial hash table size, in bits + * @max_bits - Maximum allowed hash table resize, in bits + * @ops - Registered hash table operations + * @flags - CFS_HASH_REHASH enable synamic hash resizing + * - CFS_HASH_SORT enable chained hash sort + */ +static int cfs_hash_rehash_worker(cfs_workitem_t *wi); + +#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 +static int cfs_hash_dep_print(cfs_workitem_t *wi) +{ + cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_dep_wi); + int dep; + int bkt; + int off; + int bits; + + spin_lock(&hs->hs_dep_lock); + dep = hs->hs_dep_max; + bkt = hs->hs_dep_bkt; + off = hs->hs_dep_off; + bits = hs->hs_dep_bits; + spin_unlock(&hs->hs_dep_lock); + + LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n", + hs->hs_name, bits, dep, bkt, off); + spin_lock(&hs->hs_dep_lock); + hs->hs_dep_bits = 0; /* mark as workitem done */ + spin_unlock(&hs->hs_dep_lock); + return 0; +} + +static void cfs_hash_depth_wi_init(cfs_hash_t *hs) +{ + spin_lock_init(&hs->hs_dep_lock); + cfs_wi_init(&hs->hs_dep_wi, hs, cfs_hash_dep_print); +} + +static void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) +{ + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_dep_wi)) + return; + + spin_lock(&hs->hs_dep_lock); + while (hs->hs_dep_bits != 0) { + spin_unlock(&hs->hs_dep_lock); + cond_resched(); + spin_lock(&hs->hs_dep_lock); + } + spin_unlock(&hs->hs_dep_lock); +} + +#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */ + +static inline void cfs_hash_depth_wi_init(cfs_hash_t *hs) {} +static inline void cfs_hash_depth_wi_cancel(cfs_hash_t *hs) {} + +#endif /* CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ + +cfs_hash_t * +cfs_hash_create(char *name, unsigned cur_bits, unsigned max_bits, + unsigned bkt_bits, unsigned extra_bytes, + unsigned min_theta, unsigned max_theta, + cfs_hash_ops_t *ops, unsigned flags) +{ + cfs_hash_t *hs; + int len; + + ENTRY; + + CLASSERT(CFS_HASH_THETA_BITS < 15); + + LASSERT(name != NULL); + LASSERT(ops != NULL); + LASSERT(ops->hs_key); + LASSERT(ops->hs_hash); + LASSERT(ops->hs_object); + LASSERT(ops->hs_keycmp); + LASSERT(ops->hs_get != NULL); + LASSERT(ops->hs_put_locked != NULL); + + if ((flags & CFS_HASH_REHASH) != 0) + flags |= CFS_HASH_COUNTER; /* must have counter */ + + LASSERT(cur_bits > 0); + LASSERT(cur_bits >= bkt_bits); + LASSERT(max_bits >= cur_bits && max_bits < 31); + LASSERT(ergo((flags & CFS_HASH_REHASH) == 0, cur_bits == max_bits)); + LASSERT(ergo((flags & CFS_HASH_REHASH) != 0, + (flags & CFS_HASH_NO_LOCK) == 0)); + LASSERT(ergo((flags & CFS_HASH_REHASH_KEY) != 0, + ops->hs_keycpy != NULL)); + + len = (flags & CFS_HASH_BIGNAME) == 0 ? + CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; + LIBCFS_ALLOC(hs, offsetof(cfs_hash_t, hs_name[len])); + if (hs == NULL) + RETURN(NULL); + + strncpy(hs->hs_name, name, len); + hs->hs_name[len - 1] = '\0'; + hs->hs_flags = flags; + + atomic_set(&hs->hs_refcount, 1); + atomic_set(&hs->hs_count, 0); + + cfs_hash_lock_setup(hs); + cfs_hash_hlist_setup(hs); + + hs->hs_cur_bits = (__u8)cur_bits; + hs->hs_min_bits = (__u8)cur_bits; + hs->hs_max_bits = (__u8)max_bits; + hs->hs_bkt_bits = (__u8)bkt_bits; + + hs->hs_ops = ops; + hs->hs_extra_bytes = extra_bytes; + hs->hs_rehash_bits = 0; + cfs_wi_init(&hs->hs_rehash_wi, hs, cfs_hash_rehash_worker); + cfs_hash_depth_wi_init(hs); + + if (cfs_hash_with_rehash(hs)) + __cfs_hash_set_theta(hs, min_theta, max_theta); + + hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, + CFS_HASH_NBKT(hs)); + if (hs->hs_buckets != NULL) + return hs; + + LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[len])); + RETURN(NULL); +} +EXPORT_SYMBOL(cfs_hash_create); + +/** + * Cleanup libcfs hash @hs. + */ +static void +cfs_hash_destroy(cfs_hash_t *hs) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + cfs_hash_bd_t bd; + int i; + ENTRY; + + LASSERT(hs != NULL); + LASSERT(!cfs_hash_is_exiting(hs) && + !cfs_hash_is_iterating(hs)); + + /** + * prohibit further rehashes, don't need any lock because + * I'm the only (last) one can change it. + */ + hs->hs_exiting = 1; + if (cfs_hash_with_rehash(hs)) + cfs_hash_rehash_cancel(hs); + + cfs_hash_depth_wi_cancel(hs); + /* rehash should be done/canceled */ + LASSERT(hs->hs_buckets != NULL && + hs->hs_rehash_buckets == NULL); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + LASSERT(bd.bd_bucket != NULL); + /* no need to take this lock, just for consistent code */ + cfs_hash_bd_lock(hs, &bd, 1); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + LASSERTF(!cfs_hash_with_assert_empty(hs), + "hash %s bucket %u(%u) is not " + " empty: %u items left\n", + hs->hs_name, bd.bd_bucket->hsb_index, + bd.bd_offset, bd.bd_bucket->hsb_count); + /* can't assert key valicate, because we + * can interrupt rehash */ + cfs_hash_bd_del_locked(hs, &bd, hnode); + cfs_hash_exit(hs, hnode); + } + } + LASSERT(bd.bd_bucket->hsb_count == 0); + cfs_hash_bd_unlock(hs, &bd, 1); + cond_resched(); + } + + LASSERT(atomic_read(&hs->hs_count) == 0); + + cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), + 0, CFS_HASH_NBKT(hs)); + i = cfs_hash_with_bigname(hs) ? + CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; + LIBCFS_FREE(hs, offsetof(cfs_hash_t, hs_name[i])); + + EXIT; +} + +cfs_hash_t *cfs_hash_getref(cfs_hash_t *hs) +{ + if (atomic_inc_not_zero(&hs->hs_refcount)) + return hs; + return NULL; +} +EXPORT_SYMBOL(cfs_hash_getref); + +void cfs_hash_putref(cfs_hash_t *hs) +{ + if (atomic_dec_and_test(&hs->hs_refcount)) + cfs_hash_destroy(hs); +} +EXPORT_SYMBOL(cfs_hash_putref); + +static inline int +cfs_hash_rehash_bits(cfs_hash_t *hs) +{ + if (cfs_hash_with_no_lock(hs) || + !cfs_hash_with_rehash(hs)) + return -EOPNOTSUPP; + + if (unlikely(cfs_hash_is_exiting(hs))) + return -ESRCH; + + if (unlikely(cfs_hash_is_rehashing(hs))) + return -EALREADY; + + if (unlikely(cfs_hash_is_iterating(hs))) + return -EAGAIN; + + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((hs->hs_cur_bits < hs->hs_max_bits) && + (__cfs_hash_theta(hs) > hs->hs_max_theta)) + return hs->hs_cur_bits + 1; + + if (!cfs_hash_with_shrink(hs)) + return 0; + + if ((hs->hs_cur_bits > hs->hs_min_bits) && + (__cfs_hash_theta(hs) < hs->hs_min_theta)) + return hs->hs_cur_bits - 1; + + return 0; +} + +/** + * don't allow inline rehash if: + * - user wants non-blocking change (add/del) on hash table + * - too many elements + */ +static inline int +cfs_hash_rehash_inline(cfs_hash_t *hs) +{ + return !cfs_hash_with_nblk_change(hs) && + atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called when the item is added. + */ +void +cfs_hash_add(cfs_hash_t *hs, const void *key, struct hlist_node *hnode) +{ + cfs_hash_bd_t bd; + int bits; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_bd_get_and_lock(hs, key, &bd, 1); + + cfs_hash_key_validate(hs, key, hnode); + cfs_hash_bd_add_locked(hs, &bd, hnode); + + cfs_hash_bd_unlock(hs, &bd, 1); + + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); +} +EXPORT_SYMBOL(cfs_hash_add); + +static struct hlist_node * +cfs_hash_find_or_add(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode, int noref) +{ + struct hlist_node *ehnode; + cfs_hash_bd_t bds[2]; + int bits = 0; + + LASSERT(hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + cfs_hash_key_validate(hs, key, hnode); + ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, + hnode, noref); + cfs_hash_dual_bd_unlock(hs, bds, 1); + + if (ehnode == hnode) /* new item added */ + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return ehnode; +} + +/** + * Add item @hnode to libcfs hash @hs using @key. The registered + * ops->hs_get function will be called if the item was added. + * Returns 0 on success or -EALREADY on key collisions. + */ +int +cfs_hash_add_unique(cfs_hash_t *hs, const void *key, struct hlist_node *hnode) +{ + return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? + -EALREADY : 0; +} +EXPORT_SYMBOL(cfs_hash_add_unique); + +/** + * Add item @hnode to libcfs hash @hs using @key. If this @key + * already exists in the hash then ops->hs_get will be called on the + * conflicting entry and that entry will be returned to the caller. + * Otherwise ops->hs_get is called on the item which was added. + */ +void * +cfs_hash_findadd_unique(cfs_hash_t *hs, const void *key, + struct hlist_node *hnode) +{ + hnode = cfs_hash_find_or_add(hs, key, hnode, 0); + + return cfs_hash_object(hs, hnode); +} +EXPORT_SYMBOL(cfs_hash_findadd_unique); + +/** + * Delete item @hnode from the libcfs hash @hs using @key. The @key + * is required to ensure the correct hash bucket is locked since there + * is no direct linkage from the item to the bucket. The object + * removed from the hash will be returned and obs->hs_put is called + * on the removed object. + */ +void * +cfs_hash_del(cfs_hash_t *hs, const void *key, struct hlist_node *hnode) +{ + void *obj = NULL; + int bits = 0; + cfs_hash_bd_t bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); + + /* NB: do nothing if @hnode is not in hash table */ + if (hnode == NULL || !hlist_unhashed(hnode)) { + if (bds[1].bd_bucket == NULL && hnode != NULL) { + cfs_hash_bd_del_locked(hs, &bds[0], hnode); + } else { + hnode = cfs_hash_dual_bd_finddel_locked(hs, bds, + key, hnode); + } + } + + if (hnode != NULL) { + obj = cfs_hash_object(hs, hnode); + bits = cfs_hash_rehash_bits(hs); + } + + cfs_hash_dual_bd_unlock(hs, bds, 1); + cfs_hash_unlock(hs, 0); + if (bits > 0) + cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_del); + +/** + * Delete item given @key in libcfs hash @hs. The first @key found in + * the hash will be removed, if the key exists multiple times in the hash + * @hs this function must be called once per key. The removed object + * will be returned and ops->hs_put is called on the removed object. + */ +void * +cfs_hash_del_key(cfs_hash_t *hs, const void *key) +{ + return cfs_hash_del(hs, key, NULL); +} +EXPORT_SYMBOL(cfs_hash_del_key); + +/** + * Lookup an item using @key in the libcfs hash @hs and return it. + * If the @key is found in the hash hs->hs_get() is called and the + * matching objects is returned. It is the callers responsibility + * to call the counterpart ops->hs_put using the cfs_hash_put() macro + * when when finished with the object. If the @key was not found + * in the hash @hs NULL is returned. + */ +void * +cfs_hash_lookup(cfs_hash_t *hs, const void *key) +{ + void *obj = NULL; + struct hlist_node *hnode; + cfs_hash_bd_t bds[2]; + + cfs_hash_lock(hs, 0); + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key); + if (hnode != NULL) + obj = cfs_hash_object(hs, hnode); + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); + + return obj; +} +EXPORT_SYMBOL(cfs_hash_lookup); + +static void +cfs_hash_for_each_enter(cfs_hash_t *hs) +{ + LASSERT(!cfs_hash_is_exiting(hs)); + + if (!cfs_hash_with_rehash(hs)) + return; + /* + * NB: it's race on cfs_has_t::hs_iterating, but doesn't matter + * because it's just an unreliable signal to rehash-thread, + * rehash-thread will try to finsih rehash ASAP when seeing this. + */ + hs->hs_iterating = 1; + + cfs_hash_lock(hs, 1); + hs->hs_iterators++; + + /* NB: iteration is mostly called by service thread, + * we tend to cancel pending rehash-requst, instead of + * blocking service thread, we will relaunch rehash request + * after iteration */ + if (cfs_hash_is_rehashing(hs)) + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} + +static void +cfs_hash_for_each_exit(cfs_hash_t *hs) +{ + int remained; + int bits; + + if (!cfs_hash_with_rehash(hs)) + return; + cfs_hash_lock(hs, 1); + remained = --hs->hs_iterators; + bits = cfs_hash_rehash_bits(hs); + cfs_hash_unlock(hs, 1); + /* NB: it's race on cfs_has_t::hs_iterating, see above */ + if (remained == 0) + hs->hs_iterating = 0; + if (bits > 0) { + cfs_hash_rehash(hs, atomic_read(&hs->hs_count) < + CFS_HASH_LOOP_HOG); + } +} + +/** + * For each item in the libcfs hash @hs call the passed callback @func + * and pass to it as an argument each hash item and the private @data. + * + * a) the function may sleep! + * b) during the callback: + * . the bucket lock is held so the callback must never sleep. + * . if @removal_safe is true, use can remove current item by + * cfs_hash_bd_del_locked + */ +static __u64 +cfs_hash_for_each_tight(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, + void *data, int remove_safe) +{ + struct hlist_node *hnode; + struct hlist_node *pos; + cfs_hash_bd_t bd; + __u64 count = 0; + int excl = !!remove_safe; + int loop = 0; + int i; + ENTRY; + + cfs_hash_for_each_enter(hs); + + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, excl); + if (func == NULL) { /* only glimpse size */ + count += bd.bd_bucket->hsb_count; + cfs_hash_bd_unlock(hs, &bd, excl); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + cfs_hash_bucket_validate(hs, &bd, hnode); + count++; + loop++; + if (func(hs, &bd, hnode, data)) { + cfs_hash_bd_unlock(hs, &bd, excl); + goto out; + } + } + } + cfs_hash_bd_unlock(hs, &bd, excl); + if (loop < CFS_HASH_LOOP_HOG) + continue; + loop = 0; + cfs_hash_unlock(hs, 0); + cond_resched(); + cfs_hash_lock(hs, 0); + } + out: + cfs_hash_unlock(hs, 0); + + cfs_hash_for_each_exit(hs); + RETURN(count); +} + +typedef struct { + cfs_hash_cond_opt_cb_t func; + void *arg; +} cfs_hash_cond_arg_t; + +static int +cfs_hash_cond_del_locked(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + cfs_hash_cond_arg_t *cond = data; + + if (cond->func(cfs_hash_object(hs, hnode), cond->arg)) + cfs_hash_bd_del_locked(hs, bd, hnode); + return 0; +} + +/** + * Delete item from the libcfs hash @hs when @func return true. + * The write lock being hold during loop for each bucket to avoid + * any object be reference. + */ +void +cfs_hash_cond_del(cfs_hash_t *hs, cfs_hash_cond_opt_cb_t func, void *data) +{ + cfs_hash_cond_arg_t arg = { + .func = func, + .arg = data, + }; + + cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); +} +EXPORT_SYMBOL(cfs_hash_cond_del); + +void +cfs_hash_for_each(cfs_hash_t *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each); + +void +cfs_hash_for_each_safe(cfs_hash_t *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + cfs_hash_for_each_tight(hs, func, data, 1); +} +EXPORT_SYMBOL(cfs_hash_for_each_safe); + +static int +cfs_hash_peek(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + *(int *)data = 0; + return 1; /* return 1 to break the loop */ +} + +int +cfs_hash_is_empty(cfs_hash_t *hs) +{ + int empty = 1; + + cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); + return empty; +} +EXPORT_SYMBOL(cfs_hash_is_empty); + +__u64 +cfs_hash_size_get(cfs_hash_t *hs) +{ + return cfs_hash_with_counter(hs) ? + atomic_read(&hs->hs_count) : + cfs_hash_for_each_tight(hs, NULL, NULL, 0); +} +EXPORT_SYMBOL(cfs_hash_size_get); + +/* + * cfs_hash_for_each_relax: + * Iterate the hash table and call @func on each item without + * any lock. This function can't guarantee to finish iteration + * if these features are enabled: + * + * a. if rehash_key is enabled, an item can be moved from + * one bucket to another bucket + * b. user can remove non-zero-ref item from hash-table, + * so the item can be removed from hash-table, even worse, + * it's possible that user changed key and insert to another + * hash bucket. + * there's no way for us to finish iteration correctly on previous + * two cases, so iteration has to be stopped on change. + */ +static int +cfs_hash_for_each_relax(cfs_hash_t *hs, cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + struct hlist_node *tmp; + cfs_hash_bd_t bd; + __u32 version; + int count = 0; + int stop_on_change; + int rc; + int i; + ENTRY; + + stop_on_change = cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs) || + CFS_HOP(hs, put_locked) == NULL; + cfs_hash_lock(hs, 0); + LASSERT(!cfs_hash_is_rehashing(hs)); + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 0); + version = cfs_hash_bd_version_get(&bd); + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + for (hnode = hhead->first; hnode != NULL;) { + cfs_hash_bucket_validate(hs, &bd, hnode); + cfs_hash_get(hs, hnode); + cfs_hash_bd_unlock(hs, &bd, 0); + cfs_hash_unlock(hs, 0); + + rc = func(hs, &bd, hnode, data); + if (stop_on_change) + cfs_hash_put(hs, hnode); + cond_resched(); + count++; + + cfs_hash_lock(hs, 0); + cfs_hash_bd_lock(hs, &bd, 0); + if (!stop_on_change) { + tmp = hnode->next; + cfs_hash_put_locked(hs, hnode); + hnode = tmp; + } else { /* bucket changed? */ + if (version != + cfs_hash_bd_version_get(&bd)) + break; + /* safe to continue because no change */ + hnode = hnode->next; + } + if (rc) /* callback wants to break iteration */ + break; + } + } + cfs_hash_bd_unlock(hs, &bd, 0); + } + cfs_hash_unlock(hs, 0); + + return count; +} + +int +cfs_hash_for_each_nolock(cfs_hash_t *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + ENTRY; + + if (cfs_hash_with_no_lock(hs) || + cfs_hash_with_rehash_key(hs) || + !cfs_hash_with_no_itemref(hs)) + RETURN(-EOPNOTSUPP); + + if (CFS_HOP(hs, get) == NULL || + (CFS_HOP(hs, put) == NULL && + CFS_HOP(hs, put_locked) == NULL)) + RETURN(-EOPNOTSUPP); + + cfs_hash_for_each_enter(hs); + cfs_hash_for_each_relax(hs, func, data); + cfs_hash_for_each_exit(hs); + + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_nolock); + +/** + * For each hash bucket in the libcfs hash @hs call the passed callback + * @func until all the hash buckets are empty. The passed callback @func + * or the previously registered callback hs->hs_put must remove the item + * from the hash. You may either use the cfs_hash_del() or hlist_del() + * functions. No rwlocks will be held during the callback @func it is + * safe to sleep if needed. This function will not terminate until the + * hash is empty. Note it is still possible to concurrently add new + * items in to the hash. It is the callers responsibility to ensure + * the required locking is in place to prevent concurrent insertions. + */ +int +cfs_hash_for_each_empty(cfs_hash_t *hs, + cfs_hash_for_each_cb_t func, void *data) +{ + unsigned i = 0; + ENTRY; + + if (cfs_hash_with_no_lock(hs)) + return -EOPNOTSUPP; + + if (CFS_HOP(hs, get) == NULL || + (CFS_HOP(hs, put) == NULL && + CFS_HOP(hs, put_locked) == NULL)) + return -EOPNOTSUPP; + + cfs_hash_for_each_enter(hs); + while (cfs_hash_for_each_relax(hs, func, data)) { + CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", + hs->hs_name, i++); + } + cfs_hash_for_each_exit(hs); + RETURN(0); +} +EXPORT_SYMBOL(cfs_hash_for_each_empty); + +void +cfs_hash_hlist_for_each(cfs_hash_t *hs, unsigned hindex, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_head *hhead; + struct hlist_node *hnode; + cfs_hash_bd_t bd; + + cfs_hash_for_each_enter(hs); + cfs_hash_lock(hs, 0); + if (hindex >= CFS_HASH_NHLIST(hs)) + goto out; + + cfs_hash_bd_index_set(hs, hindex, &bd); + + cfs_hash_bd_lock(hs, &bd, 0); + hhead = cfs_hash_bd_hhead(hs, &bd); + hlist_for_each(hnode, hhead) { + if (func(hs, &bd, hnode, data)) + break; + } + cfs_hash_bd_unlock(hs, &bd, 0); + out: + cfs_hash_unlock(hs, 0); + cfs_hash_for_each_exit(hs); +} + +EXPORT_SYMBOL(cfs_hash_hlist_for_each); + +/* + * For each item in the libcfs hash @hs which matches the @key call + * the passed callback @func and pass to it as an argument each hash + * item and the private @data. During the callback the bucket lock + * is held so the callback must never sleep. + */ +void +cfs_hash_for_each_key(cfs_hash_t *hs, const void *key, + cfs_hash_for_each_cb_t func, void *data) +{ + struct hlist_node *hnode; + cfs_hash_bd_t bds[2]; + unsigned i; + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); + + cfs_hash_for_each_bd(bds, 2, i) { + struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); + + hlist_for_each(hnode, hlist) { + cfs_hash_bucket_validate(hs, &bds[i], hnode); + + if (cfs_hash_keycmp(hs, key, hnode)) { + if (func(hs, &bds[i], hnode, data)) + break; + } + } + } + + cfs_hash_dual_bd_unlock(hs, bds, 0); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_for_each_key); + +/** + * Rehash the libcfs hash @hs to the given @bits. This can be used + * to grow the hash size when excessive chaining is detected, or to + * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH + * flag is set in @hs the libcfs hash may be dynamically rehashed + * during addition or removal if the hash's theta value exceeds + * either the hs->hs_min_theta or hs->max_theta values. By default + * these values are tuned to keep the chained hash depth small, and + * this approach assumes a reasonably uniform hashing function. The + * theta thresholds for @hs are tunable via cfs_hash_set_theta(). + */ +void +cfs_hash_rehash_cancel_locked(cfs_hash_t *hs) +{ + int i; + + /* need hold cfs_hash_lock(hs, 1) */ + LASSERT(cfs_hash_with_rehash(hs) && + !cfs_hash_with_no_lock(hs)); + + if (!cfs_hash_is_rehashing(hs)) + return; + + if (cfs_wi_deschedule(cfs_sched_rehash, &hs->hs_rehash_wi)) { + hs->hs_rehash_bits = 0; + return; + } + + for (i = 2; cfs_hash_is_rehashing(hs); i++) { + cfs_hash_unlock(hs, 1); + /* raise console warning while waiting too long */ + CDEBUG(IS_PO2(i >> 3) ? D_WARNING : D_INFO, + "hash %s is still rehashing, rescheded %d\n", + hs->hs_name, i - 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel_locked); + +void +cfs_hash_rehash_cancel(cfs_hash_t *hs) +{ + cfs_hash_lock(hs, 1); + cfs_hash_rehash_cancel_locked(hs); + cfs_hash_unlock(hs, 1); +} +EXPORT_SYMBOL(cfs_hash_rehash_cancel); + +int +cfs_hash_rehash(cfs_hash_t *hs, int do_rehash) +{ + int rc; + + LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); + + cfs_hash_lock(hs, 1); + + rc = cfs_hash_rehash_bits(hs); + if (rc <= 0) { + cfs_hash_unlock(hs, 1); + return rc; + } + + hs->hs_rehash_bits = rc; + if (!do_rehash) { + /* launch and return */ + cfs_wi_schedule(cfs_sched_rehash, &hs->hs_rehash_wi); + cfs_hash_unlock(hs, 1); + return 0; + } + + /* rehash right now */ + cfs_hash_unlock(hs, 1); + + return cfs_hash_rehash_worker(&hs->hs_rehash_wi); +} +EXPORT_SYMBOL(cfs_hash_rehash); + +static int +cfs_hash_rehash_bd(cfs_hash_t *hs, cfs_hash_bd_t *old) +{ + cfs_hash_bd_t new; + struct hlist_head *hhead; + struct hlist_node *hnode; + struct hlist_node *pos; + void *key; + int c = 0; + + /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ + cfs_hash_bd_for_each_hlist(hs, old, hhead) { + hlist_for_each_safe(hnode, pos, hhead) { + key = cfs_hash_key(hs, hnode); + LASSERT(key != NULL); + /* Validate hnode is in the correct bucket. */ + cfs_hash_bucket_validate(hs, old, hnode); + /* + * Delete from old hash bucket; move to new bucket. + * ops->hs_key must be defined. + */ + cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, + hs->hs_rehash_bits, key, &new); + cfs_hash_bd_move_locked(hs, old, &new, hnode); + c++; + } + } + + return c; +} + +static int +cfs_hash_rehash_worker(cfs_workitem_t *wi) +{ + cfs_hash_t *hs = container_of(wi, cfs_hash_t, hs_rehash_wi); + cfs_hash_bucket_t **bkts; + cfs_hash_bd_t bd; + unsigned int old_size; + unsigned int new_size; + int bsize; + int count = 0; + int rc = 0; + int i; + + LASSERT (hs != NULL && cfs_hash_with_rehash(hs)); + + cfs_hash_lock(hs, 0); + LASSERT(cfs_hash_is_rehashing(hs)); + + old_size = CFS_HASH_NBKT(hs); + new_size = CFS_HASH_RH_NBKT(hs); + + cfs_hash_unlock(hs, 0); + + /* + * don't need hs::hs_rwlock for hs::hs_buckets, + * because nobody can change bkt-table except me. + */ + bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, + old_size, new_size); + cfs_hash_lock(hs, 1); + if (bkts == NULL) { + rc = -ENOMEM; + goto out; + } + + if (bkts == hs->hs_buckets) { + bkts = NULL; /* do nothing */ + goto out; + } + + rc = __cfs_hash_theta(hs); + if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { + /* free the new allocated bkt-table */ + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + rc = -EALREADY; + goto out; + } + + LASSERT(hs->hs_rehash_buckets == NULL); + hs->hs_rehash_buckets = bkts; + + rc = 0; + cfs_hash_for_each_bucket(hs, &bd, i) { + if (cfs_hash_is_exiting(hs)) { + rc = -ESRCH; + /* someone wants to destroy the hash, abort now */ + if (old_size < new_size) /* OK to free old bkt-table */ + break; + /* it's shrinking, need free new bkt-table */ + hs->hs_rehash_buckets = NULL; + old_size = new_size; + new_size = CFS_HASH_NBKT(hs); + goto out; + } + + count += cfs_hash_rehash_bd(hs, &bd); + if (count < CFS_HASH_LOOP_HOG || + cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ + continue; + } + + count = 0; + cfs_hash_unlock(hs, 1); + cond_resched(); + cfs_hash_lock(hs, 1); + } + + hs->hs_rehash_count++; + + bkts = hs->hs_buckets; + hs->hs_buckets = hs->hs_rehash_buckets; + hs->hs_rehash_buckets = NULL; + + hs->hs_cur_bits = hs->hs_rehash_bits; + out: + hs->hs_rehash_bits = 0; + if (rc == -ESRCH) /* never be scheduled again */ + cfs_wi_exit(cfs_sched_rehash, wi); + bsize = cfs_hash_bkt_size(hs); + cfs_hash_unlock(hs, 1); + /* can't refer to @hs anymore because it could be destroyed */ + if (bkts != NULL) + cfs_hash_buckets_free(bkts, bsize, new_size, old_size); + if (rc != 0) + CDEBUG(D_INFO, "early quit of of rehashing: %d\n", rc); + /* return 1 only if cfs_wi_exit is called */ + return rc == -ESRCH; +} + +/** + * Rehash the object referenced by @hnode in the libcfs hash @hs. The + * @old_key must be provided to locate the objects previous location + * in the hash, and the @new_key will be used to reinsert the object. + * Use this function instead of a cfs_hash_add() + cfs_hash_del() + * combo when it is critical that there is no window in time where the + * object is missing from the hash. When an object is being rehashed + * the registered cfs_hash_get() and cfs_hash_put() functions will + * not be called. + */ +void cfs_hash_rehash_key(cfs_hash_t *hs, const void *old_key, + void *new_key, struct hlist_node *hnode) +{ + cfs_hash_bd_t bds[3]; + cfs_hash_bd_t old_bds[2]; + cfs_hash_bd_t new_bd; + + LASSERT(!hlist_unhashed(hnode)); + + cfs_hash_lock(hs, 0); + + cfs_hash_dual_bd_get(hs, old_key, old_bds); + cfs_hash_bd_get(hs, new_key, &new_bd); + + bds[0] = old_bds[0]; + bds[1] = old_bds[1]; + bds[2] = new_bd; + + /* NB: bds[0] and bds[1] are ordered already */ + cfs_hash_bd_order(&bds[1], &bds[2]); + cfs_hash_bd_order(&bds[0], &bds[1]); + + cfs_hash_multi_bd_lock(hs, bds, 3, 1); + if (likely(old_bds[1].bd_bucket == NULL)) { + cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode); + } else { + cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode); + cfs_hash_bd_add_locked(hs, &new_bd, hnode); + } + /* overwrite key inside locks, otherwise may screw up with + * other operations, i.e: rehash */ + cfs_hash_keycpy(hs, new_key, hnode); + + cfs_hash_multi_bd_unlock(hs, bds, 3, 1); + cfs_hash_unlock(hs, 0); +} +EXPORT_SYMBOL(cfs_hash_rehash_key); + +int cfs_hash_debug_header(char *str, int size) +{ + return snprintf(str, size, "%-*s%6s%6s%6s%6s%6s%6s%6s%7s%8s%8s%8s%s\n", + CFS_HASH_BIGNAME_LEN, + "name", "cur", "min", "max", "theta", "t-min", "t-max", + "flags", "rehash", "count", "maxdep", "maxdepb", + " distribution"); +} +EXPORT_SYMBOL(cfs_hash_debug_header); + +static cfs_hash_bucket_t ** +cfs_hash_full_bkts(cfs_hash_t *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return hs->hs_buckets; + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + hs->hs_rehash_buckets : hs->hs_buckets; +} + +static unsigned int +cfs_hash_full_nbkt(cfs_hash_t *hs) +{ + /* NB: caller should hold hs->hs_rwlock if REHASH is set */ + if (hs->hs_rehash_buckets == NULL) + return CFS_HASH_NBKT(hs); + + LASSERT(hs->hs_rehash_bits != 0); + return hs->hs_rehash_bits > hs->hs_cur_bits ? + CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs); +} + +int cfs_hash_debug_str(cfs_hash_t *hs, char *str, int size) +{ + int dist[8] = { 0, }; + int maxdep = -1; + int maxdepb = -1; + int total = 0; + int c = 0; + int theta; + int i; + + if (str == NULL || size == 0) + return 0; + + cfs_hash_lock(hs, 0); + theta = __cfs_hash_theta(hs); + + c += snprintf(str + c, size - c, "%-*s ", + CFS_HASH_BIGNAME_LEN, hs->hs_name); + c += snprintf(str + c, size - c, "%5d ", 1 << hs->hs_cur_bits); + c += snprintf(str + c, size - c, "%5d ", 1 << hs->hs_min_bits); + c += snprintf(str + c, size - c, "%5d ", 1 << hs->hs_max_bits); + c += snprintf(str + c, size - c, "%d.%03d ", + __cfs_hash_theta_int(theta), + __cfs_hash_theta_frac(theta)); + c += snprintf(str + c, size - c, "%d.%03d ", + __cfs_hash_theta_int(hs->hs_min_theta), + __cfs_hash_theta_frac(hs->hs_min_theta)); + c += snprintf(str + c, size - c, "%d.%03d ", + __cfs_hash_theta_int(hs->hs_max_theta), + __cfs_hash_theta_frac(hs->hs_max_theta)); + c += snprintf(str + c, size - c, " 0x%02x ", hs->hs_flags); + c += snprintf(str + c, size - c, "%6d ", hs->hs_rehash_count); + + /* + * The distribution is a summary of the chained hash depth in + * each of the libcfs hash buckets. Each buckets hsb_count is + * divided by the hash theta value and used to generate a + * histogram of the hash distribution. A uniform hash will + * result in all hash buckets being close to the average thus + * only the first few entries in the histogram will be non-zero. + * If you hash function results in a non-uniform hash the will + * be observable by outlier bucks in the distribution histogram. + * + * Uniform hash distribution: 128/128/0/0/0/0/0/0 + * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 + */ + for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { + cfs_hash_bd_t bd; + + bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; + cfs_hash_bd_lock(hs, &bd, 0); + if (maxdep < bd.bd_bucket->hsb_depmax) { + maxdep = bd.bd_bucket->hsb_depmax; + maxdepb = ffz(~maxdep); + } + total += bd.bd_bucket->hsb_count; + dist[min(__cfs_fls(bd.bd_bucket->hsb_count/max(theta,1)),7)]++; + cfs_hash_bd_unlock(hs, &bd, 0); + } + + c += snprintf(str + c, size - c, "%7d ", total); + c += snprintf(str + c, size - c, "%7d ", maxdep); + c += snprintf(str + c, size - c, "%7d ", maxdepb); + for (i = 0; i < 8; i++) + c += snprintf(str + c, size - c, "%d%c", dist[i], + (i == 7) ? '\n' : '/'); + + cfs_hash_unlock(hs, 0); + + return c; +} +EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustre/lustre/libcfs/heap.c b/drivers/staging/lustre/lustre/libcfs/heap.c new file mode 100644 index 000000000000..147e4fe4762d --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/heap.c @@ -0,0 +1,475 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + */ +/* + * libcfs/libcfs/heap.c + * + * Author: Eric Barton + * Liang Zhen + */ +/** \addtogroup heap + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#define CBH_ALLOC(ptr, h) \ +do { \ + if ((h)->cbh_flags & CBH_FLAG_ATOMIC_GROW) \ + LIBCFS_CPT_ALLOC_GFP((ptr), h->cbh_cptab, h->cbh_cptid, \ + CBH_NOB, GFP_ATOMIC); \ + else \ + LIBCFS_CPT_ALLOC((ptr), h->cbh_cptab, h->cbh_cptid, \ + CBH_NOB); \ +} while (0) + +#define CBH_FREE(ptr) LIBCFS_FREE(ptr, CBH_NOB) + +/** + * Grows the capacity of a binary heap so that it can handle a larger number of + * \e cfs_binheap_node_t objects. + * + * \param[in] h The binary heap + * + * \retval 0 Successfully grew the heap + * \retval -ENOMEM OOM error + */ +static int +cfs_binheap_grow(cfs_binheap_t *h) +{ + cfs_binheap_node_t ***frag1 = NULL; + cfs_binheap_node_t **frag2; + int hwm = h->cbh_hwm; + + /* need a whole new chunk of pointers */ + LASSERT((h->cbh_hwm & CBH_MASK) == 0); + + if (hwm == 0) { + /* first use of single indirect */ + CBH_ALLOC(h->cbh_elements1, h); + if (h->cbh_elements1 == NULL) + return -ENOMEM; + + goto out; + } + + hwm -= CBH_SIZE; + if (hwm < CBH_SIZE * CBH_SIZE) { + /* not filled double indirect */ + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (hwm == 0) { + /* first use of double indirect */ + CBH_ALLOC(h->cbh_elements2, h); + if (h->cbh_elements2 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + h->cbh_elements2[hwm >> CBH_SHIFT] = frag2; + goto out; + } + + hwm -= CBH_SIZE * CBH_SIZE; +#if (CBH_SHIFT * 3 < 32) + if (hwm >= CBH_SIZE * CBH_SIZE * CBH_SIZE) { + /* filled triple indirect */ + return -ENOMEM; + } +#endif + CBH_ALLOC(frag2, h); + if (frag2 == NULL) + return -ENOMEM; + + if (((hwm >> CBH_SHIFT) & CBH_MASK) == 0) { + /* first use of this 2nd level index */ + CBH_ALLOC(frag1, h); + if (frag1 == NULL) { + CBH_FREE(frag2); + return -ENOMEM; + } + } + + if (hwm == 0) { + /* first use of triple indirect */ + CBH_ALLOC(h->cbh_elements3, h); + if (h->cbh_elements3 == NULL) { + CBH_FREE(frag2); + CBH_FREE(frag1); + return -ENOMEM; + } + } + + if (frag1 != NULL) { + LASSERT(h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] == NULL); + h->cbh_elements3[hwm >> (2 * CBH_SHIFT)] = frag1; + } else { + frag1 = h->cbh_elements3[hwm >> (2 * CBH_SHIFT)]; + LASSERT(frag1 != NULL); + } + + frag1[(hwm >> CBH_SHIFT) & CBH_MASK] = frag2; + + out: + h->cbh_hwm += CBH_SIZE; + return 0; +} + +/** + * Creates and initializes a binary heap instance. + * + * \param[in] ops The operations to be used + * \param[in] flags The heap flags + * \parm[in] count The initial heap capacity in # of elements + * \param[in] arg An optional private argument + * \param[in] cptab The CPT table this heap instance will operate over + * \param[in] cptid The CPT id of \a cptab this heap instance will operate over + * + * \retval valid-pointer A newly-created and initialized binary heap object + * \retval NULL error + */ +cfs_binheap_t * +cfs_binheap_create(cfs_binheap_ops_t *ops, unsigned int flags, + unsigned count, void *arg, struct cfs_cpt_table *cptab, + int cptid) +{ + cfs_binheap_t *h; + + LASSERT(ops != NULL); + LASSERT(ops->hop_compare != NULL); + LASSERT(cptab != NULL); + LASSERT(cptid == CFS_CPT_ANY || + (cptid >= 0 && cptid < cptab->ctb_nparts)); + + LIBCFS_CPT_ALLOC(h, cptab, cptid, sizeof(*h)); + if (h == NULL) + return NULL; + + h->cbh_ops = ops; + h->cbh_nelements = 0; + h->cbh_hwm = 0; + h->cbh_private = arg; + h->cbh_flags = flags & (~CBH_FLAG_ATOMIC_GROW); + h->cbh_cptab = cptab; + h->cbh_cptid = cptid; + + while (h->cbh_hwm < count) { /* preallocate */ + if (cfs_binheap_grow(h) != 0) { + cfs_binheap_destroy(h); + return NULL; + } + } + + h->cbh_flags |= flags & CBH_FLAG_ATOMIC_GROW; + + return h; +} +EXPORT_SYMBOL(cfs_binheap_create); + +/** + * Releases all resources associated with a binary heap instance. + * + * Deallocates memory for all indirection levels and the binary heap object + * itself. + * + * \param[in] h The binary heap object + */ +void +cfs_binheap_destroy(cfs_binheap_t *h) +{ + int idx0; + int idx1; + int n; + + LASSERT(h != NULL); + + n = h->cbh_hwm; + + if (n > 0) { + CBH_FREE(h->cbh_elements1); + n -= CBH_SIZE; + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + CBH_FREE(h->cbh_elements2[idx0]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements2); + } + + if (n > 0) { + for (idx0 = 0; idx0 < CBH_SIZE && n > 0; idx0++) { + + for (idx1 = 0; idx1 < CBH_SIZE && n > 0; idx1++) { + CBH_FREE(h->cbh_elements3[idx0][idx1]); + n -= CBH_SIZE; + } + + CBH_FREE(h->cbh_elements3[idx0]); + } + + CBH_FREE(h->cbh_elements3); + } + + LIBCFS_FREE(h, sizeof(*h)); +} +EXPORT_SYMBOL(cfs_binheap_destroy); + +/** + * Obtains a double pointer to a heap element, given its index into the binary + * tree. + * + * \param[in] h The binary heap instance + * \param[in] idx The requested node's index + * + * \retval valid-pointer A double pointer to a heap pointer entry + */ +static cfs_binheap_node_t ** +cfs_binheap_pointer(cfs_binheap_t *h, unsigned int idx) +{ + if (idx < CBH_SIZE) + return &(h->cbh_elements1[idx]); + + idx -= CBH_SIZE; + if (idx < CBH_SIZE * CBH_SIZE) + return &(h->cbh_elements2[idx >> CBH_SHIFT][idx & CBH_MASK]); + + idx -= CBH_SIZE * CBH_SIZE; + return &(h->cbh_elements3[idx >> (2 * CBH_SHIFT)]\ + [(idx >> CBH_SHIFT) & CBH_MASK]\ + [idx & CBH_MASK]); +} + +/** + * Obtains a pointer to a heap element, given its index into the binary tree. + * + * \param[in] h The binary heap + * \param[in] idx The requested node's index + * + * \retval valid-pointer The requested heap node + * \retval NULL Supplied index is out of bounds + */ +cfs_binheap_node_t * +cfs_binheap_find(cfs_binheap_t *h, unsigned int idx) +{ + if (idx >= h->cbh_nelements) + return NULL; + + return *cfs_binheap_pointer(h, idx); +} +EXPORT_SYMBOL(cfs_binheap_find); + +/** + * Moves a node upwards, towards the root of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_bubble(cfs_binheap_t *h, cfs_binheap_node_t *e) +{ + unsigned int cur_idx = e->chn_index; + cfs_binheap_node_t **cur_ptr; + unsigned int parent_idx; + cfs_binheap_node_t **parent_ptr; + int did_sth = 0; + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx > 0) { + parent_idx = (cur_idx - 1) >> 1; + + parent_ptr = cfs_binheap_pointer(h, parent_idx); + LASSERT((*parent_ptr)->chn_index == parent_idx); + + if (h->cbh_ops->hop_compare(*parent_ptr, e)) + break; + + (*parent_ptr)->chn_index = cur_idx; + *cur_ptr = *parent_ptr; + cur_ptr = parent_ptr; + cur_idx = parent_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Moves a node downwards, towards the last level of the binary tree. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 1 The position of \a e in the tree was changed at least once + * \retval 0 The position of \a e in the tree was not changed + */ +static int +cfs_binheap_sink(cfs_binheap_t *h, cfs_binheap_node_t *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int child_idx; + cfs_binheap_node_t **child_ptr; + cfs_binheap_node_t *child; + unsigned int child2_idx; + cfs_binheap_node_t **child2_ptr; + cfs_binheap_node_t *child2; + unsigned int cur_idx; + cfs_binheap_node_t **cur_ptr; + int did_sth = 0; + + cur_idx = e->chn_index; + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + while (cur_idx < n) { + child_idx = (cur_idx << 1) + 1; + if (child_idx >= n) + break; + + child_ptr = cfs_binheap_pointer(h, child_idx); + child = *child_ptr; + + child2_idx = child_idx + 1; + if (child2_idx < n) { + child2_ptr = cfs_binheap_pointer(h, child2_idx); + child2 = *child2_ptr; + + if (h->cbh_ops->hop_compare(child2, child)) { + child_idx = child2_idx; + child_ptr = child2_ptr; + child = child2; + } + } + + LASSERT(child->chn_index == child_idx); + + if (h->cbh_ops->hop_compare(e, child)) + break; + + child->chn_index = cur_idx; + *cur_ptr = child; + cur_ptr = child_ptr; + cur_idx = child_idx; + did_sth = 1; + } + + e->chn_index = cur_idx; + *cur_ptr = e; + + return did_sth; +} + +/** + * Sort-inserts a node into the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + * + * \retval 0 Element inserted successfully + * \retval != 0 error + */ +int +cfs_binheap_insert(cfs_binheap_t *h, cfs_binheap_node_t *e) +{ + cfs_binheap_node_t **new_ptr; + unsigned int new_idx = h->cbh_nelements; + int rc; + + if (new_idx == h->cbh_hwm) { + rc = cfs_binheap_grow(h); + if (rc != 0) + return rc; + } + + if (h->cbh_ops->hop_enter) { + rc = h->cbh_ops->hop_enter(h, e); + if (rc != 0) + return rc; + } + + e->chn_index = new_idx; + new_ptr = cfs_binheap_pointer(h, new_idx); + h->cbh_nelements++; + *new_ptr = e; + + cfs_binheap_bubble(h, e); + + return 0; +} +EXPORT_SYMBOL(cfs_binheap_insert); + +/** + * Removes a node from the binary heap. + * + * \param[in] h The heap + * \param[in] e The node + */ +void +cfs_binheap_remove(cfs_binheap_t *h, cfs_binheap_node_t *e) +{ + unsigned int n = h->cbh_nelements; + unsigned int cur_idx = e->chn_index; + cfs_binheap_node_t **cur_ptr; + cfs_binheap_node_t *last; + + LASSERT(cur_idx != CBH_POISON); + LASSERT(cur_idx < n); + + cur_ptr = cfs_binheap_pointer(h, cur_idx); + LASSERT(*cur_ptr == e); + + n--; + last = *cfs_binheap_pointer(h, n); + h->cbh_nelements = n; + if (last == e) + return; + + last->chn_index = cur_idx; + *cur_ptr = last; + if (!cfs_binheap_bubble(h, *cur_ptr)) + cfs_binheap_sink(h, *cur_ptr); + + e->chn_index = CBH_POISON; + if (h->cbh_ops->hop_exit) + h->cbh_ops->hop_exit(h, e); +} +EXPORT_SYMBOL(cfs_binheap_remove); + +/** @} heap */ diff --git a/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c new file mode 100644 index 000000000000..c152223ed5d3 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/kernel_user_comm.c @@ -0,0 +1,336 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: Nathan Rutman + * + * Kernel <-> userspace communication routines. + * Using pipes for all arches. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_KUC D_OTHER + +#include + +#ifdef LUSTRE_UTILS +/* This is the userspace side. */ + +/** Start the userspace side of a KUC pipe. + * @param link Private descriptor for pipe/socket. + * @param groups KUC broadcast group to listen to + * (can be null for unicast to this pid) + */ +int libcfs_ukuc_start(lustre_kernelcomm *link, int group) +{ + int pfd[2]; + + if (pipe(pfd) < 0) + return -errno; + + memset(link, 0, sizeof(*link)); + link->lk_rfd = pfd[0]; + link->lk_wfd = pfd[1]; + link->lk_group = group; + link->lk_uid = getpid(); + return 0; +} + +int libcfs_ukuc_stop(lustre_kernelcomm *link) +{ + if (link->lk_wfd > 0) + close(link->lk_wfd); + return close(link->lk_rfd); +} + +#define lhsz sizeof(*kuch) + +/** Read a message from the link. + * Allocates memory, returns handle + * + * @param link Private descriptor for pipe/socket. + * @param buf Buffer to read into, must include size for kuc_hdr + * @param maxsize Maximum message size allowed + * @param transport Only listen to messages on this transport + * (and the generic transport) + */ +int libcfs_ukuc_msg_get(lustre_kernelcomm *link, char *buf, int maxsize, + int transport) +{ + struct kuc_hdr *kuch; + int rc = 0; + + memset(buf, 0, maxsize); + + CDEBUG(D_KUC, "Waiting for message from kernel on fd %d\n", + link->lk_rfd); + + while (1) { + /* Read header first to get message size */ + rc = read(link->lk_rfd, buf, lhsz); + if (rc <= 0) { + rc = -errno; + break; + } + kuch = (struct kuc_hdr *)buf; + + CDEBUG(D_KUC, "Received message mg=%x t=%d m=%d l=%d\n", + kuch->kuc_magic, kuch->kuc_transport, kuch->kuc_msgtype, + kuch->kuc_msglen); + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("bad message magic %x != %x\n", + kuch->kuc_magic, KUC_MAGIC); + rc = -EPROTO; + break; + } + + if (kuch->kuc_msglen > maxsize) { + rc = -EMSGSIZE; + break; + } + + /* Read payload */ + rc = read(link->lk_rfd, buf + lhsz, kuch->kuc_msglen - lhsz); + if (rc < 0) { + rc = -errno; + break; + } + if (rc < (kuch->kuc_msglen - lhsz)) { + CERROR("short read: got %d of %d bytes\n", + rc, kuch->kuc_msglen); + rc = -EPROTO; + break; + } + + if (kuch->kuc_transport == transport || + kuch->kuc_transport == KUC_TRANSPORT_GENERIC) { + return 0; + } + /* Drop messages for other transports */ + } + return rc; +} + +#else /* LUSTRE_UTILS */ +/* This is the kernel side (liblustre as well). */ + +/** + * libcfs_kkuc_msg_put - send an message from kernel to userspace + * @param fp to send the message to + * @param payload Payload data. First field of payload is always + * struct kuc_hdr + */ +int libcfs_kkuc_msg_put(struct file *filp, void *payload) +{ + struct kuc_hdr *kuch = (struct kuc_hdr *)payload; + int rc = -ENOSYS; + + if (filp == NULL || IS_ERR(filp)) + return -EBADF; + + if (kuch->kuc_magic != KUC_MAGIC) { + CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); + return -ENOSYS; + } + + { + loff_t offset = 0; + rc = filp_user_write(filp, payload, kuch->kuc_msglen, + &offset); + } + + if (rc < 0) + CWARN("message send failed (%d)\n", rc); + else + CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); + + return rc; +} +EXPORT_SYMBOL(libcfs_kkuc_msg_put); + +/* Broadcast groups are global across all mounted filesystems; + * i.e. registering for a group on 1 fs will get messages for that + * group from any fs */ +/** A single group reigstration has a uid and a file pointer */ +struct kkuc_reg { + struct list_head kr_chain; + int kr_uid; + struct file *kr_fp; + __u32 kr_data; +}; +static struct list_head kkuc_groups[KUC_GRP_MAX+1] = {}; +/* Protect message sending against remove and adds */ +static DECLARE_RWSEM(kg_sem); + +/** Add a receiver to a broadcast group + * @param filp pipe to write into + * @param uid identidier for this receiver + * @param group group number + */ +int libcfs_kkuc_group_add(struct file *filp, int uid, int group, __u32 data) +{ + struct kkuc_reg *reg; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + return -EINVAL; + } + + /* fput in group_rem */ + if (filp == NULL) + return -EBADF; + + /* freed in group_rem */ + reg = kmalloc(sizeof(*reg), 0); + if (reg == NULL) + return -ENOMEM; + + reg->kr_fp = filp; + reg->kr_uid = uid; + reg->kr_data = data; + + down_write(&kg_sem); + if (kkuc_groups[group].next == NULL) + INIT_LIST_HEAD(&kkuc_groups[group]); + list_add(®->kr_chain, &kkuc_groups[group]); + up_write(&kg_sem); + + CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); + + return 0; +} +EXPORT_SYMBOL(libcfs_kkuc_group_add); + +int libcfs_kkuc_group_rem(int uid, int group) +{ + struct kkuc_reg *reg, *next; + ENTRY; + + if (kkuc_groups[group].next == NULL) + RETURN(0); + + if (uid == 0) { + /* Broadcast a shutdown message */ + struct kuc_hdr lh; + + lh.kuc_magic = KUC_MAGIC; + lh.kuc_transport = KUC_TRANSPORT_GENERIC; + lh.kuc_msgtype = KUC_MSG_SHUTDOWN; + lh.kuc_msglen = sizeof(lh); + libcfs_kkuc_group_put(group, &lh); + } + + down_write(&kg_sem); + list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { + if ((uid == 0) || (uid == reg->kr_uid)) { + list_del(®->kr_chain); + CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", + reg->kr_uid, reg->kr_fp, group); + if (reg->kr_fp != NULL) + fput(reg->kr_fp); + kfree(reg); + } + } + up_write(&kg_sem); + + RETURN(0); +} +EXPORT_SYMBOL(libcfs_kkuc_group_rem); + +int libcfs_kkuc_group_put(int group, void *payload) +{ + struct kkuc_reg *reg; + int rc = 0; + int one_success = 0; + ENTRY; + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) { + rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); + if (rc == 0) + one_success = 1; + else if (rc == -EPIPE) { + fput(reg->kr_fp); + reg->kr_fp = NULL; + } + } + } + up_read(&kg_sem); + + /* don't return an error if the message has been delivered + * at least to one agent */ + if (one_success) + rc = 0; + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_put); + +/** + * Calls a callback function for each link of the given kuc group. + * @param group the group to call the function on. + * @param cb_func the function to be called. + * @param cb_arg iextra argument to be passed to the callback function. + */ +int libcfs_kkuc_group_foreach(int group, libcfs_kkuc_cb_t cb_func, + void *cb_arg) +{ + struct kkuc_reg *reg; + int rc = 0; + ENTRY; + + if (group > KUC_GRP_MAX) { + CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); + RETURN(-EINVAL); + } + + /* no link for this group */ + if (kkuc_groups[group].next == NULL) + RETURN(0); + + down_read(&kg_sem); + list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { + if (reg->kr_fp != NULL) { + rc = cb_func(reg->kr_data, cb_arg); + } + } + up_read(&kg_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(libcfs_kkuc_group_foreach); + +#endif /* LUSTRE_UTILS */ diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c new file mode 100644 index 000000000000..8e88eb59dd51 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/libcfs_cpu.c @@ -0,0 +1,204 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction + * + * Author: liang@whamcloud.com + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/** Global CPU partition table */ +struct cfs_cpt_table *cfs_cpt_table __read_mostly = NULL; +EXPORT_SYMBOL(cfs_cpt_table); + +#ifndef HAVE_LIBCFS_CPT + +#define CFS_CPU_VERSION_MAGIC 0xbabecafe + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + + if (ncpt != 1) { + CERROR("Can't support cpu partition number %d\n", ncpt); + return NULL; + } + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab != NULL) { + cptab->ctb_version = CFS_CPU_VERSION_MAGIC; + cptab->ctb_nparts = ncpt; + } + + return cptab; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + LASSERT(cptab->ctb_version == CFS_CPU_VERSION_MAGIC); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_online); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) { + cfs_cpt_table_free(cfs_cpt_table); + cfs_cpt_table = NULL; + } +} + +int +cfs_cpu_init(void) +{ + cfs_cpt_table = cfs_cpt_table_alloc(1); + + return cfs_cpt_table != NULL ? 0 : -1; +} + +#endif /* HAVE_LIBCFS_CPT */ diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c new file mode 100644 index 000000000000..8d6c4adf2ee6 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/libcfs_lock.c @@ -0,0 +1,192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_LNET + +#include + + +/** destroy cpu-partition lock, see libcfs_private.h for more detail */ +void +cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) +{ + LASSERT(pcl->pcl_locks != NULL); + LASSERT(!pcl->pcl_locked); + + cfs_percpt_free(pcl->pcl_locks); + LIBCFS_FREE(pcl, sizeof(*pcl)); +} +EXPORT_SYMBOL(cfs_percpt_lock_free); + +/** + * create cpu-partition lock, see libcfs_private.h for more detail. + * + * cpu-partition lock is designed for large-scale SMP system, so we need to + * reduce cacheline conflict as possible as we can, that's the + * reason we always allocate cacheline-aligned memory block. + */ +struct cfs_percpt_lock * +cfs_percpt_lock_alloc(struct cfs_cpt_table *cptab) +{ + struct cfs_percpt_lock *pcl; + spinlock_t *lock; + int i; + + /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ + LIBCFS_ALLOC(pcl, sizeof(*pcl)); + if (pcl == NULL) + return NULL; + + pcl->pcl_cptab = cptab; + pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); + if (pcl->pcl_locks == NULL) { + LIBCFS_FREE(pcl, sizeof(*pcl)); + return NULL; + } + + cfs_percpt_for_each(lock, i, pcl->pcl_locks) + spin_lock_init(lock); + + return pcl; +} +EXPORT_SYMBOL(cfs_percpt_lock_alloc); + +/** + * lock a CPU partition + * + * \a index != CFS_PERCPT_LOCK_EX + * hold private lock indexed by \a index + * + * \a index == CFS_PERCPT_LOCK_EX + * exclusively lock @pcl and nobody can take private lock + */ +void +cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); + + if (ncpt == 1) { + index = 0; + } else { /* serialize with exclusive lock */ + while (pcl->pcl_locked) + cpu_relax(); + } + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_lock(pcl->pcl_locks[index]); + return; + } + + /* exclusive lock request */ + for (i = 0; i < ncpt; i++) { + spin_lock(pcl->pcl_locks[i]); + if (i == 0) { + LASSERT(!pcl->pcl_locked); + /* nobody should take private lock after this + * so I wouldn't starve for too long time */ + pcl->pcl_locked = 1; + } + } +} +EXPORT_SYMBOL(cfs_percpt_lock); + +/** unlock a CPU partition */ +void +cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) +{ + int ncpt = cfs_cpt_number(pcl->pcl_cptab); + int i; + + index = ncpt == 1 ? 0 : index; + + if (likely(index != CFS_PERCPT_LOCK_EX)) { + spin_unlock(pcl->pcl_locks[index]); + return; + } + + for (i = ncpt - 1; i >= 0; i--) { + if (i == 0) { + LASSERT(pcl->pcl_locked); + pcl->pcl_locked = 0; + } + spin_unlock(pcl->pcl_locks[i]); + } +} +EXPORT_SYMBOL(cfs_percpt_unlock); + + +/** free cpu-partition refcount */ +void +cfs_percpt_atomic_free(atomic_t **refs) +{ + cfs_percpt_free(refs); +} +EXPORT_SYMBOL(cfs_percpt_atomic_free); + +/** allocate cpu-partition refcount with initial value @init_val */ +atomic_t ** +cfs_percpt_atomic_alloc(struct cfs_cpt_table *cptab, int init_val) +{ + atomic_t **refs; + atomic_t *ref; + int i; + + refs = cfs_percpt_alloc(cptab, sizeof(*ref)); + if (refs == NULL) + return NULL; + + cfs_percpt_for_each(ref, i, refs) + atomic_set(ref, init_val); + return refs; +} +EXPORT_SYMBOL(cfs_percpt_atomic_alloc); + +/** return sum of cpu-partition refs */ +int +cfs_percpt_atomic_summary(atomic_t **refs) +{ + atomic_t *ref; + int i; + int val = 0; + + cfs_percpt_for_each(ref, i, refs) + val += atomic_read(ref); + + return val; +} +EXPORT_SYMBOL(cfs_percpt_atomic_summary); diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c new file mode 100644 index 000000000000..879137303482 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/libcfs_mem.c @@ -0,0 +1,205 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_LNET + +#include + +struct cfs_var_array { + unsigned int va_count; /* # of buffers */ + unsigned int va_size; /* size of each var */ + struct cfs_cpt_table *va_cptab; /* cpu partition table */ + void *va_ptrs[0]; /* buffer addresses */ +}; + +/* + * free per-cpu data, see more detail in cfs_percpt_free + */ +void +cfs_percpt_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] != NULL) + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_percpt_free); + +/* + * allocate per cpu-partition variables, returned value is an array of pointers, + * variable can be indexed by CPU partition ID, i.e: + * + * arr = cfs_percpt_alloc(cfs_cpu_pt, size); + * then caller can access memory block for CPU 0 by arr[0], + * memory block for CPU 1 by arr[1]... + * memory block for CPU N by arr[N]... + * + * cacheline aligned. + */ +void * +cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) +{ + struct cfs_var_array *arr; + int count; + int i; + + count = cfs_cpt_number(cptab); + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_size = size = L1_CACHE_ALIGN(size); + arr->va_count = count; + arr->va_cptab = cptab; + + for (i = 0; i < count; i++) { + LIBCFS_CPT_ALLOC(arr->va_ptrs[i], cptab, i, size); + if (arr->va_ptrs[i] == NULL) { + cfs_percpt_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_percpt_alloc); + +/* + * return number of CPUs (or number of elements in per-cpu data) + * according to cptab of @vars + */ +int +cfs_percpt_number(void *vars) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + return arr->va_count; +} +EXPORT_SYMBOL(cfs_percpt_number); + +/* + * return memory block shadowed from current CPU + */ +void * +cfs_percpt_current(void *vars) +{ + struct cfs_var_array *arr; + int cpt; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + cpt = cfs_cpt_current(arr->va_cptab, 0); + if (cpt < 0) + return NULL; + + return arr->va_ptrs[cpt]; +} +EXPORT_SYMBOL(cfs_percpt_current); + +void * +cfs_percpt_index(void *vars, int idx) +{ + struct cfs_var_array *arr; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + LASSERT(idx >= 0 && idx < arr->va_count); + return arr->va_ptrs[idx]; +} +EXPORT_SYMBOL(cfs_percpt_index); + +/* + * free variable array, see more detail in cfs_array_alloc + */ +void +cfs_array_free(void *vars) +{ + struct cfs_var_array *arr; + int i; + + arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); + + for (i = 0; i < arr->va_count; i++) { + if (arr->va_ptrs[i] == NULL) + continue; + + LIBCFS_FREE(arr->va_ptrs[i], arr->va_size); + } + LIBCFS_FREE(arr, offsetof(struct cfs_var_array, + va_ptrs[arr->va_count])); +} +EXPORT_SYMBOL(cfs_array_free); + +/* + * allocate a variable array, returned value is an array of pointers. + * Caller can specify length of array by @count, @size is size of each + * memory block in array. + */ +void * +cfs_array_alloc(int count, unsigned int size) +{ + struct cfs_var_array *arr; + int i; + + LIBCFS_ALLOC(arr, offsetof(struct cfs_var_array, va_ptrs[count])); + if (arr == NULL) + return NULL; + + arr->va_count = count; + arr->va_size = size; + + for (i = 0; i < count; i++) { + LIBCFS_ALLOC(arr->va_ptrs[i], size); + + if (arr->va_ptrs[i] == NULL) { + cfs_array_free((void *)&arr->va_ptrs[0]); + return NULL; + } + } + + return (void *)&arr->va_ptrs[0]; +} +EXPORT_SYMBOL(cfs_array_alloc); diff --git a/drivers/staging/lustre/lustre/libcfs/libcfs_string.c b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c new file mode 100644 index 000000000000..9edccc99683e --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/libcfs_string.c @@ -0,0 +1,647 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * String manipulation functions. + * + * libcfs/libcfs/libcfs_string.c + * + * Author: Nathan Rutman + */ + +#include + +/* non-0 = don't match */ +int cfs_strncasecmp(const char *s1, const char *s2, size_t n) +{ + if (s1 == NULL || s2 == NULL) + return 1; + + if (n == 0) + return 0; + + while (n-- != 0 && tolower(*s1) == tolower(*s2)) { + if (n == 0 || *s1 == '\0' || *s2 == '\0') + break; + s1++; + s2++; + } + + return tolower(*(unsigned char *)s1) - tolower(*(unsigned char *)s2); +} +EXPORT_SYMBOL(cfs_strncasecmp); + +/* Convert a text string to a bitmask */ +int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), + int *oldmask, int minmask, int allmask) +{ + const char *debugstr; + char op = 0; + int newmask = minmask, i, len, found = 0; + ENTRY; + + /* must be a list of tokens separated by whitespace + * and optionally an operator ('+' or '-'). If an operator + * appears first in , '*oldmask' is used as the starting point + * (relative), otherwise minmask is used (absolute). An operator + * applies to all following tokens up to the next operator. */ + while (*str != 0) { + while (isspace(*str)) + str++; + if (*str == 0) + break; + if (*str == '+' || *str == '-') { + op = *str++; + if (!found) + /* only if first token is relative */ + newmask = *oldmask; + while (isspace(*str)) + str++; + if (*str == 0) /* trailing op */ + return -EINVAL; + } + + /* find token length */ + for (len = 0; str[len] != 0 && !isspace(str[len]) && + str[len] != '+' && str[len] != '-'; len++); + + /* match token */ + found = 0; + for (i = 0; i < 32; i++) { + debugstr = bit2str(i); + if (debugstr != NULL && + strlen(debugstr) == len && + cfs_strncasecmp(str, debugstr, len) == 0) { + if (op == '-') + newmask &= ~(1 << i); + else + newmask |= (1 << i); + found = 1; + break; + } + } + if (!found && len == 3 && + (cfs_strncasecmp(str, "ALL", len) == 0)) { + if (op == '-') + newmask = minmask; + else + newmask = allmask; + found = 1; + } + if (!found) { + CWARN("unknown mask '%.*s'.\n" + "mask usage: [+|-] ...\n", len, str); + return -EINVAL; + } + str += len; + } + + *oldmask = newmask; + return 0; +} +EXPORT_SYMBOL(cfs_str2mask); + +/* Duplicate a string in a platform-independent way */ +char *cfs_strdup(const char *str, u_int32_t flags) +{ + size_t lenz; /* length of str + zero byte */ + char *dup_str; + + lenz = strlen(str) + 1; + + dup_str = kmalloc(lenz, flags); + if (dup_str == NULL) + return NULL; + + memcpy(dup_str, str, lenz); + + return dup_str; +} +EXPORT_SYMBOL(cfs_strdup); + +/** + * cfs_{v}snprintf() return the actual size that is printed rather than + * the size that would be printed in standard functions. + */ +/* safe vsnprintf */ +int cfs_vsnprintf(char *buf, size_t size, const char *fmt, va_list args) +{ + int i; + + LASSERT(size > 0); + i = vsnprintf(buf, size, fmt, args); + + return (i >= size ? size - 1 : i); +} +EXPORT_SYMBOL(cfs_vsnprintf); + +/* safe snprintf */ +int cfs_snprintf(char *buf, size_t size, const char *fmt, ...) +{ + va_list args; + int i; + + va_start(args, fmt); + i = cfs_vsnprintf(buf, size, fmt, args); + va_end(args); + + return i; +} +EXPORT_SYMBOL(cfs_snprintf); + +/* get the first string out of @str */ +char *cfs_firststr(char *str, size_t size) +{ + size_t i = 0; + char *end; + + /* trim leading spaces */ + while (i < size && *str && isspace(*str)) { + ++i; + ++str; + } + + /* string with all spaces */ + if (*str == '\0') + goto out; + + end = str; + while (i < size && *end != '\0' && !isspace(*end)) { + ++i; + ++end; + } + + *end= '\0'; +out: + return str; +} +EXPORT_SYMBOL(cfs_firststr); + +char * +cfs_trimwhite(char *str) +{ + char *end; + + while (cfs_iswhite(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!cfs_iswhite(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} +EXPORT_SYMBOL(cfs_trimwhite); + +/** + * Extracts tokens from strings. + * + * Looks for \a delim in string \a next, sets \a res to point to + * substring before the delimiter, sets \a next right after the found + * delimiter. + * + * \retval 1 if \a res points to a string of non-whitespace characters + * \retval 0 otherwise + */ +int +cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) +{ + char *end; + + if (next->ls_str == NULL) + return 0; + + /* skip leading white spaces */ + while (next->ls_len) { + if (!cfs_iswhite(*next->ls_str)) + break; + next->ls_str++; + next->ls_len--; + } + + if (next->ls_len == 0) /* whitespaces only */ + return 0; + + if (*next->ls_str == delim) { + /* first non-writespace is the delimiter */ + return 0; + } + + res->ls_str = next->ls_str; + end = memchr(next->ls_str, delim, next->ls_len); + if (end == NULL) { + /* there is no the delimeter in the string */ + end = next->ls_str + next->ls_len; + next->ls_str = NULL; + } else { + next->ls_str = end + 1; + next->ls_len -= (end - res->ls_str + 1); + } + + /* skip ending whitespaces */ + while (--end != res->ls_str) { + if (!cfs_iswhite(*end)) + break; + } + + res->ls_len = end - res->ls_str + 1; + return 1; +} +EXPORT_SYMBOL(cfs_gettok); + +/** + * Converts string to integer. + * + * Accepts decimal and hexadecimal number recordings. + * + * \retval 1 if first \a nob chars of \a str convert to decimal or + * hexadecimal integer in the range [\a min, \a max] + * \retval 0 otherwise + */ +int +cfs_str2num_check(char *str, int nob, unsigned *num, + unsigned min, unsigned max) +{ + char *endp; + + str = cfs_trimwhite(str); + *num = strtoul(str, &endp, 0); + if (endp == str) + return 0; + + for (; endp < str + nob; endp++) { + if (!cfs_iswhite(*endp)) + return 0; + } + + return (*num >= min && *num <= max); +} +EXPORT_SYMBOL(cfs_str2num_check); + +/** + * Parses \ token of the syntax. If \a bracketed is false, + * \a src should only have a single token which can be \ or \* + * + * \retval pointer to allocated range_expr and initialized + * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a + `* src parses to + * \ | + * \ '-' \ | + * \ '-' \ '/' \ + * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or + * -ENOMEM will be returned. + */ +int +cfs_range_expr_parse(struct cfs_lstr *src, unsigned min, unsigned max, + int bracketed, struct cfs_range_expr **expr) +{ + struct cfs_range_expr *re; + struct cfs_lstr tok; + + LIBCFS_ALLOC(re, sizeof(*re)); + if (re == NULL) + return -ENOMEM; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + re->re_lo = min; + re->re_hi = max; + re->re_stride = 1; + goto out; + } + + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_lo, min, max)) { + /* is parsed */ + re->re_hi = re->re_lo; + re->re_stride = 1; + goto out; + } + + if (!bracketed || !cfs_gettok(src, '-', &tok)) + goto failed; + + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_lo, min, max)) + goto failed; + + /* - */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_hi, min, max)) { + /* - is parsed */ + re->re_stride = 1; + goto out; + } + + /* go to check '-' '/' */ + if (cfs_gettok(src, '/', &tok)) { + if (!cfs_str2num_check(tok.ls_str, tok.ls_len, + &re->re_hi, min, max)) + goto failed; + + /* - / ... */ + if (cfs_str2num_check(src->ls_str, src->ls_len, + &re->re_stride, min, max)) { + /* - / is parsed */ + goto out; + } + } + + out: + *expr = re; + return 0; + + failed: + LIBCFS_FREE(re, sizeof(*re)); + return -EINVAL; +} +EXPORT_SYMBOL(cfs_range_expr_parse); + +/** + * Matches value (\a value) against ranges expression list \a expr_list. + * + * \retval 1 if \a value matches + * \retval 0 otherwise + */ +int +cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + if (value >= expr->re_lo && value <= expr->re_hi && + ((value - expr->re_lo) % expr->re_stride) == 0) + return 1; + } + + return 0; +} +EXPORT_SYMBOL(cfs_expr_list_match); + +/** + * Convert express list (\a expr_list) to an array of all matched values + * + * \retval N N is total number of all matched values + * \retval 0 if expression list is empty + * \retval < 0 for failure + */ +int +cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp) +{ + struct cfs_range_expr *expr; + __u32 *val; + int count = 0; + int i; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + count++; + } + } + + if (count == 0) /* empty expression list */ + return 0; + + if (count > max) { + CERROR("Number of values %d exceeds max allowed %d\n", + max, count); + return -EINVAL; + } + + LIBCFS_ALLOC(val, sizeof(val[0]) * count); + if (val == NULL) + return -ENOMEM; + + count = 0; + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + for (i = expr->re_lo; i <= expr->re_hi; i++) { + if (((i - expr->re_lo) % expr->re_stride) == 0) + val[count++] = i; + } + } + + *valpp = val; + return count; +} +EXPORT_SYMBOL(cfs_expr_list_values); + +/** + * Frees cfs_range_expr structures of \a expr_list. + * + * \retval none + */ +void +cfs_expr_list_free(struct cfs_expr_list *expr_list) +{ + while (!list_empty(&expr_list->el_exprs)) { + struct cfs_range_expr *expr; + + expr = list_entry(expr_list->el_exprs.next, + struct cfs_range_expr, re_link), + list_del(&expr->re_link); + LIBCFS_FREE(expr, sizeof(*expr)); + } + + LIBCFS_FREE(expr_list, sizeof(*expr_list)); +} +EXPORT_SYMBOL(cfs_expr_list_free); + +void +cfs_expr_list_print(struct cfs_expr_list *expr_list) +{ + struct cfs_range_expr *expr; + + list_for_each_entry(expr, &expr_list->el_exprs, re_link) { + CDEBUG(D_WARNING, "%d-%d/%d\n", + expr->re_lo, expr->re_hi, expr->re_stride); + } +} +EXPORT_SYMBOL(cfs_expr_list_print); + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a str parses to \ | \ + * \retval 0 otherwise + */ +int +cfs_expr_list_parse(char *str, int len, unsigned min, unsigned max, + struct cfs_expr_list **elpp) +{ + struct cfs_expr_list *expr_list; + struct cfs_range_expr *expr; + struct cfs_lstr src; + int rc; + + LIBCFS_ALLOC(expr_list, sizeof(*expr_list)); + if (expr_list == NULL) + return -ENOMEM; + + src.ls_str = str; + src.ls_len = len; + + INIT_LIST_HEAD(&expr_list->el_exprs); + + if (src.ls_str[0] == '[' && + src.ls_str[src.ls_len - 1] == ']') { + src.ls_str++; + src.ls_len -= 2; + + rc = -EINVAL; + while (src.ls_str != NULL) { + struct cfs_lstr tok; + + if (!cfs_gettok(&src, ',', &tok)) { + rc = -EINVAL; + break; + } + + rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); + if (rc != 0) + break; + + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } else { + rc = cfs_range_expr_parse(&src, min, max, 0, &expr); + if (rc == 0) { + list_add_tail(&expr->re_link, + &expr_list->el_exprs); + } + } + + if (rc != 0) + cfs_expr_list_free(expr_list); + else + *elpp = expr_list; + + return rc; +} +EXPORT_SYMBOL(cfs_expr_list_parse); + +/** + * Frees cfs_expr_list structures of \a list. + * + * For each struct cfs_expr_list structure found on \a list it frees + * range_expr list attached to it and frees the cfs_expr_list itself. + * + * \retval none + */ +void +cfs_expr_list_free_list(struct list_head *list) +{ + struct cfs_expr_list *el; + + while (!list_empty(list)) { + el = list_entry(list->next, + struct cfs_expr_list, el_link); + list_del(&el->el_link); + cfs_expr_list_free(el); + } +} +EXPORT_SYMBOL(cfs_expr_list_free_list); + +int +cfs_ip_addr_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + struct cfs_lstr src; + int rc; + int i; + + src.ls_str = str; + src.ls_len = len; + i = 0; + + while (src.ls_str != NULL) { + struct cfs_lstr res; + + if (!cfs_gettok(&src, '.', &res)) { + rc = -EINVAL; + goto out; + } + + rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); + if (rc != 0) + goto out; + + list_add_tail(&el->el_link, list); + i++; + } + + if (i == 4) + return 0; + + rc = -EINVAL; + out: + cfs_expr_list_free_list(list); + + return rc; +} +EXPORT_SYMBOL(cfs_ip_addr_parse); + +/** + * Matches address (\a addr) against address set encoded in \a list. + * + * \retval 1 if \a addr matches + * \retval 0 otherwise + */ +int +cfs_ip_addr_match(__u32 addr, struct list_head *list) +{ + struct cfs_expr_list *el; + int i = 0; + + list_for_each_entry_reverse(el, list, el_link) { + if (!cfs_expr_list_match(addr & 0xff, el)) + return 0; + addr >>= 8; + i++; + } + + return i == 4; +} +EXPORT_SYMBOL(cfs_ip_addr_match); + +void +cfs_ip_addr_free(struct list_head *list) +{ + cfs_expr_list_free_list(list); +} +EXPORT_SYMBOL(cfs_ip_addr_free); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c new file mode 100644 index 000000000000..6e255ff55e85 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c @@ -0,0 +1,1085 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Author: liang@whamcloud.com + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include + +#ifdef CONFIG_SMP + +/** + * modparam for setting number of partitions + * + * 0 : estimate best value based on cores or NUMA nodes + * 1 : disable multiple partitions + * >1 : specify number of partitions + */ +static int cpu_npartitions; +CFS_MODULE_PARM(cpu_npartitions, "i", int, 0444, "# of CPU partitions"); + +/** + * modparam for setting CPU partitions patterns: + * + * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, + * number in bracket is processor ID (core or HT) + * + * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket + * are NUMA node ID, number before bracket is CPU partition ID. + * + * NB: If user specified cpu_pattern, cpu_npartitions will be ignored + */ +static char *cpu_pattern = ""; +CFS_MODULE_PARM(cpu_pattern, "s", charp, 0444, "CPU partitions pattern"); + +struct cfs_cpt_data { + /* serialize hotplug etc */ + spinlock_t cpt_lock; + /* reserved for hotplug */ + unsigned long cpt_version; + /* mutex to protect cpt_cpumask */ + struct semaphore cpt_mutex; + /* scratch buffer for set/unset_node */ + cpumask_t *cpt_cpumask; +}; + +static struct cfs_cpt_data cpt_data; + +void +cfs_cpu_core_siblings(int cpu, cpumask_t *mask) +{ + /* return cpumask of cores in the same socket */ + cpumask_copy(mask, topology_core_cpumask(cpu)); +} +EXPORT_SYMBOL(cfs_cpu_core_siblings); + +/* return number of cores in the same socket of \a cpu */ +int +cfs_cpu_core_nsiblings(int cpu) +{ + int num; + + down(&cpt_data.cpt_mutex); + + cfs_cpu_core_siblings(cpu, cpt_data.cpt_cpumask); + num = cpus_weight(*cpt_data.cpt_cpumask); + + up(&cpt_data.cpt_mutex); + + return num; +} +EXPORT_SYMBOL(cfs_cpu_core_nsiblings); + +/* return cpumask of HTs in the same core */ +void +cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) +{ + cpumask_copy(mask, topology_thread_cpumask(cpu)); +} +EXPORT_SYMBOL(cfs_cpu_ht_siblings); + +/* return number of HTs in the same core of \a cpu */ +int +cfs_cpu_ht_nsiblings(int cpu) +{ + int num; + + down(&cpt_data.cpt_mutex); + + cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask); + num = cpus_weight(*cpt_data.cpt_cpumask); + + up(&cpt_data.cpt_mutex); + + return num; +} +EXPORT_SYMBOL(cfs_cpu_ht_nsiblings); + +void +cfs_node_to_cpumask(int node, cpumask_t *mask) +{ + cpumask_copy(mask, cpumask_of_node(node)); +} +EXPORT_SYMBOL(cfs_node_to_cpumask); + +void +cfs_cpt_table_free(struct cfs_cpt_table *cptab) +{ + int i; + + if (cptab->ctb_cpu2cpt != NULL) { + LIBCFS_FREE(cptab->ctb_cpu2cpt, + num_possible_cpus() * + sizeof(cptab->ctb_cpu2cpt[0])); + } + + for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + if (part->cpt_nodemask != NULL) { + LIBCFS_FREE(part->cpt_nodemask, + sizeof(*part->cpt_nodemask)); + } + + if (part->cpt_cpumask != NULL) + LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); + } + + if (cptab->ctb_parts != NULL) { + LIBCFS_FREE(cptab->ctb_parts, + cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); + } + + if (cptab->ctb_nodemask != NULL) + LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + if (cptab->ctb_cpumask != NULL) + LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); + + LIBCFS_FREE(cptab, sizeof(*cptab)); +} +EXPORT_SYMBOL(cfs_cpt_table_free); + +struct cfs_cpt_table * +cfs_cpt_table_alloc(unsigned int ncpt) +{ + struct cfs_cpt_table *cptab; + int i; + + LIBCFS_ALLOC(cptab, sizeof(*cptab)); + if (cptab == NULL) + return NULL; + + cptab->ctb_nparts = ncpt; + + LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); + LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); + + if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) + goto failed; + + LIBCFS_ALLOC(cptab->ctb_cpu2cpt, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + if (cptab->ctb_cpu2cpt == NULL) + goto failed; + + memset(cptab->ctb_cpu2cpt, -1, + num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); + + LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); + if (cptab->ctb_parts == NULL) + goto failed; + + for (i = 0; i < ncpt; i++) { + struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; + + LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); + LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); + if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL) + goto failed; + } + + spin_lock(&cpt_data.cpt_lock); + /* Reserved for hotplug */ + cptab->ctb_version = cpt_data.cpt_version; + spin_unlock(&cpt_data.cpt_lock); + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} +EXPORT_SYMBOL(cfs_cpt_table_alloc); + +int +cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) +{ + char *tmp = buf; + int rc = 0; + int i; + int j; + + for (i = 0; i < cptab->ctb_nparts; i++) { + if (len > 0) { + rc = snprintf(tmp, len, "%d\t: ", i); + len -= rc; + } + + if (len <= 0) { + rc = -EFBIG; + goto out; + } + + tmp += rc; + for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) { + rc = snprintf(tmp, len, "%d ", j); + len -= rc; + if (len <= 0) { + rc = -EFBIG; + goto out; + } + tmp += rc; + } + + *tmp = '\n'; + tmp++; + len--; + } + + out: + if (rc < 0) + return rc; + + return tmp - buf; +} +EXPORT_SYMBOL(cfs_cpt_table_print); + +int +cfs_cpt_number(struct cfs_cpt_table *cptab) +{ + return cptab->ctb_nparts; +} +EXPORT_SYMBOL(cfs_cpt_number); + +int +cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cpus_weight(*cptab->ctb_cpumask) : + cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask); +} +EXPORT_SYMBOL(cfs_cpt_weight); + +int +cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS : + any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS; +} +EXPORT_SYMBOL(cfs_cpt_online); + +cpumask_t * +cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; +} +EXPORT_SYMBOL(cfs_cpt_cpumask); + +nodemask_t * +cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) +{ + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + return cpt == CFS_CPT_ANY ? + cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; +} +EXPORT_SYMBOL(cfs_cpt_nodemask); + +int +cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + + LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); + + if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) { + CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); + return 0; + } + + if (cptab->ctb_cpu2cpt[cpu] != -1) { + CDEBUG(D_INFO, "CPU %d is already in partition %d\n", + cpu, cptab->ctb_cpu2cpt[cpu]); + return 0; + } + + cptab->ctb_cpu2cpt[cpu] = cpt; + + LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask)); + LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); + + cpu_set(cpu, *cptab->ctb_cpumask); + cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); + + node = cpu_to_node(cpu); + + /* first CPU of @node in this CPT table */ + if (!node_isset(node, *cptab->ctb_nodemask)) + node_set(node, *cptab->ctb_nodemask); + + /* first CPU of @node in this partition */ + if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) + node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpu); + +void +cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) +{ + int node; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpu < 0 || cpu >= NR_CPUS) { + CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); + return; + } + + if (cpt == CFS_CPT_ANY) { + /* caller doesn't know the partition ID */ + cpt = cptab->ctb_cpu2cpt[cpu]; + if (cpt < 0) { /* not set in this CPT-table */ + CDEBUG(D_INFO, "Try to unset cpu %d which is " + "not in CPT-table %p\n", cpt, cptab); + return; + } + + } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { + CDEBUG(D_INFO, + "CPU %d is not in cpu-partition %d\n", cpu, cpt); + return; + } + + LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); + LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask)); + + cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); + cpu_clear(cpu, *cptab->ctb_cpumask); + cptab->ctb_cpu2cpt[cpu] = -1; + + node = cpu_to_node(cpu); + + LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); + LASSERT(node_isset(node, *cptab->ctb_nodemask)); + + for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) { + /* this CPT has other CPU belonging to this node? */ + if (cpu_to_node(i) == node) + break; + } + + if (i == NR_CPUS) + node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); + + for_each_cpu_mask(i, *cptab->ctb_cpumask) { + /* this CPT-table has other CPU belonging to this node? */ + if (cpu_to_node(i) == node) + break; + } + + if (i == NR_CPUS) + node_clear(node, *cptab->ctb_nodemask); + + return; +} +EXPORT_SYMBOL(cfs_cpt_unset_cpu); + +int +cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) { + CDEBUG(D_INFO, "No online CPU is found in the CPU mask " + "for CPU partition %d\n", cpt); + return 0; + } + + for_each_cpu_mask(i, *mask) { + if (!cfs_cpt_set_cpu(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_cpumask); + +void +cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) +{ + int i; + + for_each_cpu_mask(i, *mask) + cfs_cpt_unset_cpu(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_cpumask); + +int +cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + int rc; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return 0; + } + + down(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + rc = cfs_cpt_set_cpumask(cptab, cpt, mask); + + up(&cpt_data.cpt_mutex); + + return rc; +} +EXPORT_SYMBOL(cfs_cpt_set_node); + +void +cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) +{ + cpumask_t *mask; + + if (node < 0 || node >= MAX_NUMNODES) { + CDEBUG(D_INFO, + "Invalid NUMA id %d for CPU partition %d\n", node, cpt); + return; + } + + down(&cpt_data.cpt_mutex); + + mask = cpt_data.cpt_cpumask; + cfs_node_to_cpumask(node, mask); + + cfs_cpt_unset_cpumask(cptab, cpt, mask); + + up(&cpt_data.cpt_mutex); +} +EXPORT_SYMBOL(cfs_cpt_unset_node); + +int +cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) { + if (!cfs_cpt_set_node(cptab, cpt, i)) + return 0; + } + + return 1; +} +EXPORT_SYMBOL(cfs_cpt_set_nodemask); + +void +cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) +{ + int i; + + for_each_node_mask(i, *mask) + cfs_cpt_unset_node(cptab, cpt, i); +} +EXPORT_SYMBOL(cfs_cpt_unset_nodemask); + +void +cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) +{ + int last; + int i; + + if (cpt == CFS_CPT_ANY) { + last = cptab->ctb_nparts - 1; + cpt = 0; + } else { + last = cpt; + } + + for (; cpt <= last; cpt++) { + for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) + cfs_cpt_unset_cpu(cptab, cpt, i); + } +} +EXPORT_SYMBOL(cfs_cpt_clear); + +int +cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) +{ + nodemask_t *mask; + int weight; + int rotor; + int node; + + /* convert CPU partition ID to HW node id */ + + if (cpt < 0 || cpt >= cptab->ctb_nparts) { + mask = cptab->ctb_nodemask; + rotor = cptab->ctb_spread_rotor++; + } else { + mask = cptab->ctb_parts[cpt].cpt_nodemask; + rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; + } + + weight = nodes_weight(*mask); + LASSERT(weight > 0); + + rotor %= weight; + + for_each_node_mask(node, *mask) { + if (rotor-- == 0) + return node; + } + + LBUG(); + return 0; +} +EXPORT_SYMBOL(cfs_cpt_spread_node); + +int +cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) +{ + int cpu = smp_processor_id(); + int cpt = cptab->ctb_cpu2cpt[cpu]; + + if (cpt < 0) { + if (!remap) + return cpt; + + /* don't return negative value for safety of upper layer, + * instead we shadow the unknown cpu to a valid partition ID */ + cpt = cpu % cptab->ctb_nparts; + } + + return cpt; +} +EXPORT_SYMBOL(cfs_cpt_current); + +int +cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) +{ + LASSERT(cpu >= 0 && cpu < NR_CPUS); + + return cptab->ctb_cpu2cpt[cpu]; +} +EXPORT_SYMBOL(cfs_cpt_of_cpu); + +int +cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) +{ + cpumask_t *cpumask; + nodemask_t *nodemask; + int rc; + int i; + + LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); + + if (cpt == CFS_CPT_ANY) { + cpumask = cptab->ctb_cpumask; + nodemask = cptab->ctb_nodemask; + } else { + cpumask = cptab->ctb_parts[cpt].cpt_cpumask; + nodemask = cptab->ctb_parts[cpt].cpt_nodemask; + } + + if (any_online_cpu(*cpumask) == NR_CPUS) { + CERROR("No online CPU found in CPU partition %d, did someone " + "do CPU hotplug on system? You might need to reload " + "Lustre modules to keep system working well.\n", cpt); + return -EINVAL; + } + + for_each_online_cpu(i) { + if (cpu_isset(i, *cpumask)) + continue; + + rc = set_cpus_allowed(current, *cpumask); + set_mems_allowed(*nodemask); + if (rc == 0) + schedule(); /* switch to allowed CPU */ + + return rc; + } + + /* don't need to set affinity because all online CPUs are covered */ + return 0; +} +EXPORT_SYMBOL(cfs_cpt_bind); + +/** + * Choose max to \a number CPUs from \a node and set them in \a cpt. + * We always prefer to choose CPU in the same core/socket. + */ +static int +cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, + cpumask_t *node, int number) +{ + cpumask_t *socket = NULL; + cpumask_t *core = NULL; + int rc = 0; + int cpu; + + LASSERT(number > 0); + + if (number >= cpus_weight(*node)) { + while (!cpus_empty(*node)) { + cpu = first_cpu(*node); + + rc = cfs_cpt_set_cpu(cptab, cpt, cpu); + if (!rc) + return -EINVAL; + cpu_clear(cpu, *node); + } + return 0; + } + + /* allocate scratch buffer */ + LIBCFS_ALLOC(socket, cpumask_size()); + LIBCFS_ALLOC(core, cpumask_size()); + if (socket == NULL || core == NULL) { + rc = -ENOMEM; + goto out; + } + + while (!cpus_empty(*node)) { + cpu = first_cpu(*node); + + /* get cpumask for cores in the same socket */ + cfs_cpu_core_siblings(cpu, socket); + cpus_and(*socket, *socket, *node); + + LASSERT(!cpus_empty(*socket)); + + while (!cpus_empty(*socket)) { + int i; + + /* get cpumask for hts in the same core */ + cfs_cpu_ht_siblings(cpu, core); + cpus_and(*core, *core, *node); + + LASSERT(!cpus_empty(*core)); + + for_each_cpu_mask(i, *core) { + cpu_clear(i, *socket); + cpu_clear(i, *node); + + rc = cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + rc = -EINVAL; + goto out; + } + + if (--number == 0) + goto out; + } + cpu = first_cpu(*socket); + } + } + + out: + if (socket != NULL) + LIBCFS_FREE(socket, cpumask_size()); + if (core != NULL) + LIBCFS_FREE(core, cpumask_size()); + return rc; +} + +#define CPT_WEIGHT_MIN 4u + +static unsigned int +cfs_cpt_num_estimate(void) +{ + unsigned nnode = num_online_nodes(); + unsigned ncpu = num_online_cpus(); + unsigned ncpt; + + if (ncpu <= CPT_WEIGHT_MIN) { + ncpt = 1; + goto out; + } + + /* generate reasonable number of CPU partitions based on total number + * of CPUs, Preferred N should be power2 and match this condition: + * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */ + for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {} + + if (ncpt <= nnode) { /* fat numa system */ + while (nnode > ncpt) + nnode >>= 1; + + } else { /* ncpt > nnode */ + while ((nnode << 1) <= ncpt) + nnode <<= 1; + } + + ncpt = nnode; + + out: +#if (BITS_PER_LONG == 32) + /* config many CPU partitions on 32-bit system could consume + * too much memory */ + ncpt = min(2U, ncpt); +#endif + while (ncpu % ncpt != 0) + ncpt--; /* worst case is 1 */ + + return ncpt; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create(int ncpt) +{ + struct cfs_cpt_table *cptab = NULL; + cpumask_t *mask = NULL; + int cpt = 0; + int num; + int rc; + int i; + + rc = cfs_cpt_num_estimate(); + if (ncpt <= 0) + ncpt = rc; + + if (ncpt > num_online_cpus() || ncpt > 4 * rc) { + CWARN("CPU partition number %d is larger than suggested " + "value (%d), your system may have performance" + "issue or run out of memory while under pressure\n", + ncpt, rc); + } + + if (num_online_cpus() % ncpt != 0) { + CERROR("CPU number %d is not multiple of cpu_npartition %d, " + "please try different cpu_npartitions value or" + "set pattern string by cpu_pattern=STRING\n", + (int)num_online_cpus(), ncpt); + goto failed; + } + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate CPU map(%d)\n", ncpt); + goto failed; + } + + num = num_online_cpus() / ncpt; + if (num == 0) { + CERROR("CPU changed while setting CPU partition\n"); + goto failed; + } + + LIBCFS_ALLOC(mask, cpumask_size()); + if (mask == NULL) { + CERROR("Failed to allocate scratch cpumask\n"); + goto failed; + } + + for_each_online_node(i) { + cfs_node_to_cpumask(i, mask); + + while (!cpus_empty(*mask)) { + struct cfs_cpu_partition *part; + int n; + + if (cpt >= ncpt) + goto failed; + + part = &cptab->ctb_parts[cpt]; + + n = num - cpus_weight(*part->cpt_cpumask); + LASSERT(n > 0); + + rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); + if (rc < 0) + goto failed; + + LASSERT(num >= cpus_weight(*part->cpt_cpumask)); + if (num == cpus_weight(*part->cpt_cpumask)) + cpt++; + } + } + + if (cpt != ncpt || + num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { + CERROR("Expect %d(%d) CPU partitions but got %d(%d), " + "CPU hotplug/unplug while setting?\n", + cptab->ctb_nparts, num, cpt, + cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)); + goto failed; + } + + LIBCFS_FREE(mask, cpumask_size()); + + return cptab; + + failed: + CERROR("Failed to setup CPU-partition-table with %d " + "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", + ncpt, num_online_nodes(), num_online_cpus()); + + if (mask != NULL) + LIBCFS_FREE(mask, cpumask_size()); + + if (cptab != NULL) + cfs_cpt_table_free(cptab); + + return NULL; +} + +static struct cfs_cpt_table * +cfs_cpt_table_create_pattern(char *pattern) +{ + struct cfs_cpt_table *cptab; + char *str = pattern; + int node = 0; + int high; + int ncpt; + int c; + + for (ncpt = 0;; ncpt++) { /* quick scan bracket */ + str = strchr(str, '['); + if (str == NULL) + break; + str++; + } + + str = cfs_trimwhite(pattern); + if (*str == 'n' || *str == 'N') { + pattern = str + 1; + node = 1; + } + + if (ncpt == 0 || + (node && ncpt > num_online_nodes()) || + (!node && ncpt > num_online_cpus())) { + CERROR("Invalid pattern %s, or too many partitions %d\n", + pattern, ncpt); + return NULL; + } + + high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1; + + cptab = cfs_cpt_table_alloc(ncpt); + if (cptab == NULL) { + CERROR("Failed to allocate cpu partition table\n"); + return NULL; + } + + for (str = cfs_trimwhite(pattern), c = 0;; c++) { + struct cfs_range_expr *range; + struct cfs_expr_list *el; + char *bracket = strchr(str, '['); + int cpt; + int rc; + int i; + int n; + + if (bracket == NULL) { + if (*str != 0) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } else if (c != ncpt) { + CERROR("expect %d partitions but found %d\n", + ncpt, c); + goto failed; + } + break; + } + + if (sscanf(str, "%u%n", &cpt, &n) < 1) { + CERROR("Invalid cpu pattern %s\n", str); + goto failed; + } + + if (cpt < 0 || cpt >= ncpt) { + CERROR("Invalid partition id %d, total partitions %d\n", + cpt, ncpt); + goto failed; + } + + if (cfs_cpt_weight(cptab, cpt) != 0) { + CERROR("Partition %d has already been set.\n", cpt); + goto failed; + } + + str = cfs_trimwhite(str + n); + if (str != bracket) { + CERROR("Invalid pattern %s\n", str); + goto failed; + } + + bracket = strchr(str, ']'); + if (bracket == NULL) { + CERROR("missing right bracket for cpt %d, %s\n", + cpt, str); + goto failed; + } + + if (cfs_expr_list_parse(str, (bracket - str) + 1, + 0, high, &el) != 0) { + CERROR("Can't parse number range: %s\n", str); + goto failed; + } + + list_for_each_entry(range, &el->el_exprs, re_link) { + for (i = range->re_lo; i <= range->re_hi; i++) { + if ((i - range->re_lo) % range->re_stride != 0) + continue; + + rc = node ? cfs_cpt_set_node(cptab, cpt, i) : + cfs_cpt_set_cpu(cptab, cpt, i); + if (!rc) { + cfs_expr_list_free(el); + goto failed; + } + } + } + + cfs_expr_list_free(el); + + if (!cfs_cpt_online(cptab, cpt)) { + CERROR("No online CPU is found on partition %d\n", cpt); + goto failed; + } + + str = cfs_trimwhite(bracket + 1); + } + + return cptab; + + failed: + cfs_cpt_table_free(cptab); + return NULL; +} + +#ifdef CONFIG_HOTPLUG_CPU +static int +cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&cpt_data.cpt_lock); + cpt_data.cpt_version++; + spin_unlock(&cpt_data.cpt_lock); + default: + CWARN("Lustre: can't support CPU hotplug well now, " + "performance and stability could be impacted" + "[CPU %u notify: %lx]\n", cpu, action); + } + + return NOTIFY_OK; +} + +static struct notifier_block cfs_cpu_notifier = { + .notifier_call = cfs_cpu_notify, + .priority = 0 +}; + +#endif + +void +cfs_cpu_fini(void) +{ + if (cfs_cpt_table != NULL) + cfs_cpt_table_free(cfs_cpt_table); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&cfs_cpu_notifier); +#endif + if (cpt_data.cpt_cpumask != NULL) + LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size()); +} + +int +cfs_cpu_init(void) +{ + LASSERT(cfs_cpt_table == NULL); + + memset(&cpt_data, 0, sizeof(cpt_data)); + + LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size()); + if (cpt_data.cpt_cpumask == NULL) { + CERROR("Failed to allocate scratch buffer\n"); + return -1; + } + + spin_lock_init(&cpt_data.cpt_lock); + sema_init(&cpt_data.cpt_mutex, 1); + +#ifdef CONFIG_HOTPLUG_CPU + register_hotcpu_notifier(&cfs_cpu_notifier); +#endif + + if (*cpu_pattern != 0) { + cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create cptab from pattern %s\n", + cpu_pattern); + goto failed; + } + + } else { + cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); + if (cfs_cpt_table == NULL) { + CERROR("Failed to create ptable with npartitions %d\n", + cpu_npartitions); + goto failed; + } + } + + spin_lock(&cpt_data.cpt_lock); + if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) { + spin_unlock(&cpt_data.cpt_lock); + CERROR("CPU hotplug/unplug during setup\n"); + goto failed; + } + spin_unlock(&cpt_data.cpt_lock); + + LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n", + num_online_cpus(), cfs_cpt_number(cfs_cpt_table)); + return 0; + + failed: + cfs_cpu_fini(); + return -1; +} + +#endif diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c new file mode 100644 index 000000000000..20b2d61d9ff2 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-adler.c @@ -0,0 +1,144 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to zlib_adler32. + */ + +#include +#include +#include + + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + + +static u32 __adler32(u32 cksum, unsigned char const *p, size_t len) +{ + return zlib_adler32(cksum, p, len); +} + +static int adler32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 1; + + return 0; +} + +static int adler32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = *(u32 *)key; + return 0; +} + +static int adler32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *cksump = shash_desc_ctx(desc); + + *cksump = *mctx; + + return 0; +} + +static int adler32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *cksump = shash_desc_ctx(desc); + + *cksump = __adler32(*cksump, data, len); + return 0; +} +static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, + u8 *out) +{ + *(u32 *)out = __adler32(*cksump, data, len); + return 0; +} + +static int adler32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int adler32_final(struct shash_desc *desc, u8 *out) +{ + u32 *cksump = shash_desc_ctx(desc); + + *(u32 *)out = *cksump; + return 0; +} + +static int adler32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = adler32_setkey, + .init = adler32_init, + .update = adler32_update, + .final = adler32_final, + .finup = adler32_finup, + .digest = adler32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "adler32", + .cra_driver_name = "adler32-zlib", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = adler32_cra_init, + } +}; + + +int cfs_crypto_adler32_register(void) +{ + return crypto_register_shash(&alg); +} +EXPORT_SYMBOL(cfs_crypto_adler32_register); + +void cfs_crypto_adler32_unregister(void) +{ + crypto_unregister_shash(&alg); +} +EXPORT_SYMBOL(cfs_crypto_adler32_unregister); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c new file mode 100644 index 000000000000..83af630c06c6 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32.c @@ -0,0 +1,149 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + */ + +/* + * This is crypto api shash wrappers to crc32_le. + */ + +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +static u32 __crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le(crc, p, len); +} + +/** No default init with ~0 */ +static int crc32_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = __crc32_le(*crcp, data, len); + return 0; +} +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(__crc32_le(*crcp, data, len)); + return 0; +} + +static int crc32_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +static struct shash_alg alg = { + .setkey = crc32_setkey, + .init = crc32_init, + .update = crc32_update, + .final = crc32_final, + .finup = crc32_finup, + .digest = crc32_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-table", + .cra_priority = 100, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_cra_init, + } +}; + +int cfs_crypto_crc32_register(void) +{ + return crypto_register_shash(&alg); +} +EXPORT_SYMBOL(cfs_crypto_crc32_register); + +void cfs_crypto_crc32_unregister(void) +{ + crypto_unregister_shash(&alg); +} +EXPORT_SYMBOL(cfs_crypto_crc32_unregister); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c new file mode 100644 index 000000000000..dd29aa502980 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto-crc32pclmul.c @@ -0,0 +1,193 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * + * Author: Alexander Boyko + */ +#include +#include +#include +#include +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define PCLMUL_MIN_LEN 64L /* minimum size of buffer + * for crc32_pclmul_le_16 */ +#define SCALE_F 16L /* size of xmm register */ +#define SCALE_F_MASK (SCALE_F - 1) + +u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32); + +static u32 __attribute__((pure)) + crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient; + unsigned int iremainder; + unsigned int prealign; + + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK) + return crc32_le(crc, p, len); + + if ((long)p & SCALE_F_MASK) { + /* align p to 16 byte */ + prealign = SCALE_F - ((long)p & SCALE_F_MASK); + + crc = crc32_le(crc, p, prealign); + len -= prealign; + p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) & + ~SCALE_F_MASK); + } + iquotient = len & (~SCALE_F_MASK); + iremainder = len & SCALE_F_MASK; + + kernel_fpu_begin(); + crc = crc32_pclmul_le_16(p, iquotient, crc); + kernel_fpu_end(); + + if (iremainder) + crc = crc32_le(crc, p + iquotient, iremainder); + + return crc; +} + +static int crc32_pclmul_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = 0; + + return 0; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32_pclmul_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32_pclmul_le(*crcp, data, len); + return 0; +} + +/* No final XOR 0xFFFFFFFF, like crc32_le */ +static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len)); + return 0; +} + +static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32_pclmul_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = cpu_to_le32p(crcp); + return 0; +} + +static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static struct shash_alg alg = { + .setkey = crc32_pclmul_setkey, + .init = crc32_pclmul_init, + .update = crc32_pclmul_update, + .final = crc32_pclmul_final, + .finup = crc32_pclmul_finup, + .digest = crc32_pclmul_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32", + .cra_driver_name = "crc32-pclmul", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32_pclmul_cra_init, + } +}; + +#ifndef X86_FEATURE_PCLMULQDQ +#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ +#endif + +int cfs_crypto_crc32_pclmul_register(void) +{ + + if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + CDEBUG(D_INFO, "PCLMULQDQ-NI instructions are not " + "detected.\n"); + return -ENODEV; + } + return crypto_register_shash(&alg); +} + +void cfs_crypto_crc32_pclmul_unregister(void) +{ + crypto_unregister_shash(&alg); +} diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c new file mode 100644 index 000000000000..f3899bd94408 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-crypto.c @@ -0,0 +1,305 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see http://www.gnu.org/licenses + * + * Please visit http://www.xyratex.com/contact if you need additional + * information or have any questions. + * + * GPL HEADER END + */ + +/* + * Copyright 2012 Xyratex Technology Limited + * + * Copyright (c) 2012, Intel Corporation. + */ + +#include +#include +#include +#include +/** + * Array of hash algorithm speed in MByte per second + */ +static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; + + + +static int cfs_crypto_hash_alloc(unsigned char alg_id, + const struct cfs_crypto_hash_type **type, + struct hash_desc *desc, unsigned char *key, + unsigned int key_len) +{ + int err = 0; + + *type = cfs_crypto_hash_type(alg_id); + + if (*type == NULL) { + CWARN("Unsupported hash algorithm id = %d, max id is %d\n", + alg_id, CFS_HASH_ALG_MAX); + return -EINVAL; + } + desc->tfm = crypto_alloc_hash((*type)->cht_name, 0, 0); + + if (desc->tfm == NULL) + return -EINVAL; + + if (IS_ERR(desc->tfm)) { + CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", + (*type)->cht_name); + return PTR_ERR(desc->tfm); + } + + desc->flags = 0; + + /** Shash have different logic for initialization then digest + * shash: crypto_hash_setkey, crypto_hash_init + * digest: crypto_digest_init, crypto_digest_setkey + * Skip this function for digest, because we use shash logic at + * cfs_crypto_hash_alloc. + */ + if (key != NULL) { + err = crypto_hash_setkey(desc->tfm, key, key_len); + } else if ((*type)->cht_key != 0) { + err = crypto_hash_setkey(desc->tfm, + (unsigned char *)&((*type)->cht_key), + (*type)->cht_size); + } + + if (err != 0) { + crypto_free_hash(desc->tfm); + return err; + } + + CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_name, + (crypto_hash_tfm(desc->tfm))->__crt_alg->cra_driver_name, + cfs_crypto_hash_speeds[alg_id]); + + return crypto_hash_init(desc); +} + +int cfs_crypto_hash_digest(unsigned char alg_id, + const void *buf, unsigned int buf_len, + unsigned char *key, unsigned int key_len, + unsigned char *hash, unsigned int *hash_len) +{ + struct scatterlist sl; + struct hash_desc hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + if (buf == NULL || buf_len == 0 || hash_len == NULL) + return -EINVAL; + + err = cfs_crypto_hash_alloc(alg_id, &type, &hdesc, key, key_len); + if (err != 0) + return err; + + if (hash == NULL || *hash_len < type->cht_size) { + *hash_len = type->cht_size; + crypto_free_hash(hdesc.tfm); + return -ENOSPC; + } + sg_init_one(&sl, (void *)buf, buf_len); + + hdesc.flags = 0; + err = crypto_hash_digest(&hdesc, &sl, sl.length, hash); + crypto_free_hash(hdesc.tfm); + + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_digest); + +struct cfs_crypto_hash_desc * + cfs_crypto_hash_init(unsigned char alg_id, + unsigned char *key, unsigned int key_len) +{ + + struct hash_desc *hdesc; + int err; + const struct cfs_crypto_hash_type *type; + + hdesc = kmalloc(sizeof(*hdesc), 0); + if (hdesc == NULL) + return ERR_PTR(-ENOMEM); + + err = cfs_crypto_hash_alloc(alg_id, &type, hdesc, key, key_len); + + if (err) { + kfree(hdesc); + return ERR_PTR(err); + } + return (struct cfs_crypto_hash_desc *)hdesc; +} +EXPORT_SYMBOL(cfs_crypto_hash_init); + +int cfs_crypto_hash_update_page(struct cfs_crypto_hash_desc *hdesc, + struct page *page, unsigned int offset, + unsigned int len) +{ + struct scatterlist sl; + + sg_init_table(&sl, 1); + sg_set_page(&sl, page, len, offset & ~CFS_PAGE_MASK); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update_page); + +int cfs_crypto_hash_update(struct cfs_crypto_hash_desc *hdesc, + const void *buf, unsigned int buf_len) +{ + struct scatterlist sl; + + sg_init_one(&sl, (void *)buf, buf_len); + + return crypto_hash_update((struct hash_desc *)hdesc, &sl, sl.length); +} +EXPORT_SYMBOL(cfs_crypto_hash_update); + +/* If hash_len pointer is NULL - destroy descriptor. */ +int cfs_crypto_hash_final(struct cfs_crypto_hash_desc *hdesc, + unsigned char *hash, unsigned int *hash_len) +{ + int err; + int size = crypto_hash_digestsize(((struct hash_desc *)hdesc)->tfm); + + if (hash_len == NULL) { + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return 0; + } + if (hash == NULL || *hash_len < size) { + *hash_len = size; + return -ENOSPC; + } + err = crypto_hash_final((struct hash_desc *) hdesc, hash); + + if (err < 0) { + /* May be caller can fix error */ + return err; + } + crypto_free_hash(((struct hash_desc *)hdesc)->tfm); + kfree(hdesc); + return err; +} +EXPORT_SYMBOL(cfs_crypto_hash_final); + +static void cfs_crypto_performance_test(unsigned char alg_id, + const unsigned char *buf, + unsigned int buf_len) +{ + unsigned long start, end; + int bcount, err = 0; + int sec = 1; /* do test only 1 sec */ + unsigned char hash[64]; + unsigned int hash_len = 64; + + for (start = jiffies, end = start + sec * HZ, bcount = 0; + time_before(jiffies, end); bcount++) { + err = cfs_crypto_hash_digest(alg_id, buf, buf_len, NULL, 0, + hash, &hash_len); + if (err) + break; + + } + end = jiffies; + + if (err) { + cfs_crypto_hash_speeds[alg_id] = -1; + CDEBUG(D_INFO, "Crypto hash algorithm %s, err = %d\n", + cfs_crypto_hash_name(alg_id), err); + } else { + unsigned long tmp; + tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * + 1000) / (1024 * 1024); + cfs_crypto_hash_speeds[alg_id] = (int)tmp; + } + CDEBUG(D_INFO, "Crypto hash algorithm %s speed = %d MB/s\n", + cfs_crypto_hash_name(alg_id), cfs_crypto_hash_speeds[alg_id]); +} + +int cfs_crypto_hash_speed(unsigned char hash_alg) +{ + if (hash_alg < CFS_HASH_ALG_MAX) + return cfs_crypto_hash_speeds[hash_alg]; + else + return -1; +} +EXPORT_SYMBOL(cfs_crypto_hash_speed); + +/** + * Do performance test for all hash algorithms. + */ +static int cfs_crypto_test_hashes(void) +{ + unsigned char i; + unsigned char *data; + unsigned int j; + /* Data block size for testing hash. Maximum + * kmalloc size for 2.6.18 kernel is 128K */ + unsigned int data_len = 1 * 128 * 1024; + + data = kmalloc(data_len, 0); + if (data == NULL) { + CERROR("Failed to allocate mem\n"); + return -ENOMEM; + } + + for (j = 0; j < data_len; j++) + data[j] = j & 0xff; + + for (i = 0; i < CFS_HASH_ALG_MAX; i++) + cfs_crypto_performance_test(i, data, data_len); + + kfree(data); + return 0; +} + +static int crc32, adler32; + +#ifdef CONFIG_X86 +static int crc32pclmul; +#endif + +int cfs_crypto_register(void) +{ + crc32 = cfs_crypto_crc32_register(); + adler32 = cfs_crypto_adler32_register(); + +#ifdef CONFIG_X86 + crc32pclmul = cfs_crypto_crc32_pclmul_register(); +#endif + + /* check all algorithms and do performance test */ + cfs_crypto_test_hashes(); + return 0; +} +void cfs_crypto_unregister(void) +{ + if (crc32 == 0) + cfs_crypto_crc32_unregister(); + if (adler32 == 0) + cfs_crypto_adler32_unregister(); + +#ifdef CONFIG_X86 + if (crc32pclmul == 0) + cfs_crypto_crc32_pclmul_unregister(); +#endif + + return; +} diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c new file mode 100644 index 000000000000..f236510a2f3f --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-curproc.c @@ -0,0 +1,339 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-curproc.c + * + * Lustre curproc API implementation for Linux kernel + * + * Author: Nikita Danilov + */ + +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for Linux kernel. + */ + +int cfs_curproc_groups_nr(void) +{ + int nr; + + task_lock(current); + nr = current_cred()->group_info->ngroups; + task_unlock(current); + return nr; +} + +void cfs_curproc_groups_dump(gid_t *array, int size) +{ + task_lock(current); + size = min_t(int, size, current_cred()->group_info->ngroups); + memcpy(array, current_cred()->group_info->blocks[0], size * sizeof(__u32)); + task_unlock(current); +} + + +int current_is_in_group(gid_t gid) +{ + return in_group_p(gid); +} + +/* Currently all the CFS_CAP_* defines match CAP_* ones. */ +#define cfs_cap_pack(cap) (cap) +#define cfs_cap_unpack(cap) (cap) + +void cfs_cap_raise(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_raise(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +void cfs_cap_lower(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cap_lower(cred->cap_effective, cfs_cap_unpack(cap)); + commit_creds(cred); + } +} + +int cfs_cap_raised(cfs_cap_t cap) +{ + return cap_raised(current_cap(), cfs_cap_unpack(cap)); +} + +void cfs_kernel_cap_pack(kernel_cap_t kcap, cfs_cap_t *cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *cap = cfs_cap_pack(kcap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + *cap = cfs_cap_pack(kcap[0]); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + /* XXX lost high byte */ + *cap = cfs_cap_pack(kcap.cap[0]); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +void cfs_kernel_cap_unpack(kernel_cap_t *kcap, cfs_cap_t cap) +{ +#if defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x19980330 + *kcap = cfs_cap_unpack(cap); +#elif defined (_LINUX_CAPABILITY_VERSION) && _LINUX_CAPABILITY_VERSION == 0x20071026 + (*kcap)[0] = cfs_cap_unpack(cap); +#elif defined(_KERNEL_CAPABILITY_VERSION) && _KERNEL_CAPABILITY_VERSION == 0x20080522 + kcap->cap[0] = cfs_cap_unpack(cap); +#else + #error "need correct _KERNEL_CAPABILITY_VERSION " +#endif +} + +cfs_cap_t cfs_curproc_cap_pack(void) +{ + cfs_cap_t cap; + cfs_kernel_cap_pack(current_cap(), &cap); + return cap; +} + +void cfs_curproc_cap_unpack(cfs_cap_t cap) +{ + struct cred *cred; + if ((cred = prepare_creds())) { + cfs_kernel_cap_unpack(&cred->cap_effective, cap); + commit_creds(cred); + } +} + +int cfs_capable(cfs_cap_t cap) +{ + return capable(cfs_cap_unpack(cap)); +} + +/* Check if task is running in 32-bit API mode, for the purpose of + * userspace binary interfaces. On 32-bit Linux this is (unfortunately) + * always true, even if the application is using LARGEFILE64 and 64-bit + * APIs, because Linux provides no way for the filesystem to know if it + * is called via 32-bit or 64-bit APIs. Other clients may vary. On + * 64-bit systems, this will only be true if the binary is calling a + * 32-bit system call. */ +int current_is_32bit(void) +{ + return is_compat_task(); +} + +static int cfs_access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + /* Just copied from kernel for the kernels which doesn't + * have access_process_vm() exported */ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, rc, offset; + void *maddr; + + rc = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (rc <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} + +/* Read the environment variable of current process specified by @key. */ +int cfs_get_environ(const char *key, char *value, int *val_len) +{ + struct mm_struct *mm; + char *buffer, *tmp_buf = NULL; + int buf_len = PAGE_CACHE_SIZE; + int key_len = strlen(key); + unsigned long addr; + int rc; + ENTRY; + + buffer = kmalloc(buf_len, GFP_USER); + if (!buffer) + RETURN(-ENOMEM); + + mm = get_task_mm(current); + if (!mm) { + kfree(buffer); + RETURN(-EINVAL); + } + + /* Avoid deadlocks on mmap_sem if called from sys_mmap_pgoff(), + * which is already holding mmap_sem for writes. If some other + * thread gets the write lock in the meantime, this thread will + * block, but at least it won't deadlock on itself. LU-1735 */ + if (down_read_trylock(&mm->mmap_sem) == 0) + return -EDEADLK; + up_read(&mm->mmap_sem); + + addr = mm->env_start; + while (addr < mm->env_end) { + int this_len, retval, scan_len; + char *env_start, *env_end; + + memset(buffer, 0, buf_len); + + this_len = min_t(int, mm->env_end - addr, buf_len); + retval = cfs_access_process_vm(current, addr, buffer, + this_len, 0); + if (retval != this_len) + break; + + addr += retval; + + /* Parse the buffer to find out the specified key/value pair. + * The "key=value" entries are separated by '\0'. */ + env_start = buffer; + scan_len = this_len; + while (scan_len) { + char *entry; + int entry_len; + + env_end = memscan(env_start, '\0', scan_len); + LASSERT(env_end >= env_start && + env_end <= env_start + scan_len); + + /* The last entry of this buffer cross the buffer + * boundary, reread it in next cycle. */ + if (unlikely(env_end - env_start == scan_len)) { + /* This entry is too large to fit in buffer */ + if (unlikely(scan_len == this_len)) { + CERROR("Too long env variable.\n"); + GOTO(out, rc = -EINVAL); + } + addr -= scan_len; + break; + } + + entry = env_start; + entry_len = env_end - env_start; + + /* Key length + length of '=' */ + if (entry_len > key_len + 1 && + !memcmp(entry, key, key_len)) { + entry += key_len + 1; + entry_len -= key_len + 1; + /* The 'value' buffer passed in is too small.*/ + if (entry_len >= *val_len) + GOTO(out, rc = -EOVERFLOW); + + memcpy(value, entry, entry_len); + *val_len = entry_len; + GOTO(out, rc = 0); + } + + scan_len -= (env_end - env_start + 1); + env_start = env_end + 1; + } + } + GOTO(out, rc = -ENOENT); + +out: + mmput(mm); + kfree((void *)buffer); + if (tmp_buf) + kfree((void *)tmp_buf); + return rc; +} +EXPORT_SYMBOL(cfs_get_environ); + +EXPORT_SYMBOL(cfs_curproc_groups_nr); +EXPORT_SYMBOL(cfs_curproc_groups_dump); +EXPORT_SYMBOL(current_is_in_group); +EXPORT_SYMBOL(cfs_cap_raise); +EXPORT_SYMBOL(cfs_cap_lower); +EXPORT_SYMBOL(cfs_cap_raised); +EXPORT_SYMBOL(cfs_curproc_cap_pack); +EXPORT_SYMBOL(cfs_curproc_cap_unpack); +EXPORT_SYMBOL(cfs_capable); +EXPORT_SYMBOL(current_is_32bit); + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c new file mode 100644 index 000000000000..e2c195b8dd53 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c @@ -0,0 +1,264 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-debug.c + * + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#include "tracefile.h" + +#include + +char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall"; +char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; + +/** + * Upcall function once a Lustre log has been dumped. + * + * \param file path of the dumped log + */ +void libcfs_run_debug_log_upcall(char *file) +{ + char *argv[3]; + int rc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = lnet_debug_log_upcall; + + LASSERTF(file != NULL, "called on a null filename\n"); + argv[1] = file; //only need to pass the path of the file + + argv[2] = NULL; + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET debug log upcall %s %s; " + "check /proc/sys/lnet/debug_log_upcall\n", + rc, argv[0], argv[1]); + } else { + CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", + argv[0], argv[1]); + } + + EXIT; +} + +void libcfs_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = lnet_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0 && rc != -ENOENT) { + CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; " + "check /proc/sys/lnet/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CDEBUG(D_HA, "Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + +void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *msgdata) +{ + char *argv[6]; + char buf[32]; + + ENTRY; + snprintf (buf, sizeof buf, "%d", msgdata->msg_line); + + argv[1] = "LBUG"; + argv[2] = (char *)msgdata->msg_file; + argv[3] = (char *)msgdata->msg_fn; + argv[4] = buf; + argv[5] = NULL; + + libcfs_run_upcall (argv); +} + +/* coverity[+kill] */ +void lbug_with_loc(struct libcfs_debug_msg_data *msgdata) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(msgdata, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + if (!libcfs_panic_on_lbug) + libcfs_debug_dumplog(); + libcfs_run_lbug_upcall(msgdata); + if (libcfs_panic_on_lbug) + panic("LBUG"); + set_task_state(current, TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} + + +#include +#include + + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +# define RELIABLE reliable +# define DUMP_TRACE_CONST const +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + char fmt[32]; + touch_nmi_watchdog(); + sprintf(fmt, " [<%016lx>] %s%%s\n", addr, RELIABLE ? "": "? "); + __print_symbol(fmt, addr); +} + +static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = { + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, +}; + +void libcfs_debug_dumpstack(struct task_struct *tsk) +{ + /* dump_stack() */ + /* show_trace() */ + if (tsk == NULL) + tsk = current; + printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm); + /* show_trace_log_lvl() */ + printk("\nCall Trace:\n"); + dump_trace(tsk, NULL, NULL, + 0, + &print_trace_ops, NULL); + printk("\n"); +} + +task_t *libcfs_current(void) +{ + CWARN("current task struct is %p\n", current); + return current; +} + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + notifier_call : panic_notifier, + next : NULL, + priority : 10000 +}; + +void libcfs_register_panic_notifier(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +} + +void libcfs_unregister_panic_notifier(void) +{ + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +} + +EXPORT_SYMBOL(libcfs_debug_dumpstack); +EXPORT_SYMBOL(libcfs_current); + + +EXPORT_SYMBOL(libcfs_run_upcall); +EXPORT_SYMBOL(libcfs_run_lbug_upcall); +EXPORT_SYMBOL(lbug_with_loc); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c new file mode 100644 index 000000000000..cb969694a38f --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-fs.c @@ -0,0 +1,113 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include + +#include + +/* write a userspace buffer to disk. + * NOTE: this returns 0 on success, not the number of bytes written. */ +ssize_t +filp_user_write(struct file *filp, const void *buf, size_t count, + loff_t *offset) +{ + mm_segment_t fs; + ssize_t size = 0; + + fs = get_fs(); + set_fs(KERNEL_DS); + while ((ssize_t)count > 0) { + size = vfs_write(filp, (const void __user *)buf, count, offset); + if (size < 0) + break; + count -= size; + buf += size; + size = 0; + } + set_fs(fs); + + return size; +} +EXPORT_SYMBOL(filp_user_write); + +#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL && \ + CFS_O_NOACCESS == O_NOACCESS &&\ + CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\ + CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\ + CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\ + CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\ + CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW) + +int cfs_oflags2univ(int flags) +{ + int f; + + f = flags & O_NOACCESS; + f |= (flags & O_CREAT) ? CFS_O_CREAT: 0; + f |= (flags & O_EXCL) ? CFS_O_EXCL: 0; + f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0; + f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0; + f |= (flags & O_APPEND) ? CFS_O_APPEND: 0; + f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0; + f |= (flags & O_SYNC)? CFS_O_SYNC: 0; + f |= (flags & FASYNC)? CFS_O_ASYNC: 0; + f |= (flags & O_DIRECTORY)? CFS_O_DIRECTORY: 0; + f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0; + f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0; + f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0; + f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0; + return f; +} +#else + +int cfs_oflags2univ(int flags) +{ + return (flags); +} +#endif +EXPORT_SYMBOL(cfs_oflags2univ); + +/* + * XXX Liang: we don't need cfs_univ2oflags() now. + */ +int cfs_univ2oflags(int flags) +{ + return (flags); +} +EXPORT_SYMBOL(cfs_univ2oflags); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c new file mode 100644 index 000000000000..6f7162e71fb3 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-lock.c @@ -0,0 +1,38 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c new file mode 100644 index 000000000000..3be3ede1148d --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-mem.c @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include + +/* + * NB: we will rename some of above functions in another patch: + * - rename kmalloc to cfs_malloc + * - rename kmalloc/free_page to cfs_page_alloc/free + * - rename kmalloc/free_large to cfs_vmalloc/vfree + */ + +void * +cfs_cpt_malloc(struct cfs_cpt_table *cptab, int cpt, + size_t nr_bytes, unsigned int flags) +{ + void *ptr; + + ptr = kmalloc_node(nr_bytes, flags, + cfs_cpt_spread_node(cptab, cpt)); + if (ptr != NULL && (flags & __GFP_ZERO) != 0) + memset(ptr, 0, nr_bytes); + + return ptr; +} +EXPORT_SYMBOL(cfs_cpt_malloc); + +void * +cfs_cpt_vmalloc(struct cfs_cpt_table *cptab, int cpt, size_t nr_bytes) +{ + return vmalloc_node(nr_bytes, cfs_cpt_spread_node(cptab, cpt)); +} +EXPORT_SYMBOL(cfs_cpt_vmalloc); + +struct page * +cfs_page_cpt_alloc(struct cfs_cpt_table *cptab, int cpt, unsigned int flags) +{ + return alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), flags, 0); +} +EXPORT_SYMBOL(cfs_page_cpt_alloc); + +void * +cfs_mem_cache_cpt_alloc(struct kmem_cache *cachep, struct cfs_cpt_table *cptab, + int cpt, unsigned int flags) +{ + return kmem_cache_alloc_node(cachep, flags, + cfs_cpt_spread_node(cptab, cpt)); +} +EXPORT_SYMBOL(cfs_mem_cache_cpt_alloc); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c new file mode 100644 index 000000000000..2c7d4a3d660f --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-module.c @@ -0,0 +1,183 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#define LNET_MINOR 240 + +int libcfs_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct libcfs_ioctl_hdr *hdr; + struct libcfs_ioctl_data *data; + int err; + ENTRY; + + hdr = (struct libcfs_ioctl_hdr *)buf; + data = (struct libcfs_ioctl_data *)buf; + + err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); + if (err) + RETURN(err); + + if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) { + CERROR("PORTALS: version mismatch kernel vs application\n"); + RETURN(-EINVAL); + } + + if (hdr->ioc_len + buf >= end) { + CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + RETURN(-EINVAL); + } + + + if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) { + CERROR("PORTALS: user buffer too small for ioctl\n"); + RETURN(-EINVAL); + } + + err = copy_from_user(buf, (void *)arg, hdr->ioc_len); + if (err) + RETURN(err); + + if (libcfs_ioctl_is_invalid(data)) { + CERROR("PORTALS: ioctl not correctly formatted\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + + if (data->ioc_inllen2) + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + cfs_size_round(data->ioc_inllen1); + + RETURN(0); +} + +int libcfs_ioctl_popdata(void *arg, void *data, int size) +{ + if (copy_to_user((char *)arg, data, size)) + return -EFAULT; + return 0; +} + +extern struct cfs_psdev_ops libcfs_psdev_ops; + +static int +libcfs_psdev_open(struct inode * inode, struct file * file) +{ + struct libcfs_device_userstate **pdu = NULL; + int rc = 0; + + if (!inode) + return (-EINVAL); + pdu = (struct libcfs_device_userstate **)&file->private_data; + if (libcfs_psdev_ops.p_open != NULL) + rc = libcfs_psdev_ops.p_open(0, (void *)pdu); + else + return (-EPERM); + return rc; +} + +/* called when closing /dev/device */ +static int +libcfs_psdev_release(struct inode * inode, struct file * file) +{ + struct libcfs_device_userstate *pdu; + int rc = 0; + + if (!inode) + return (-EINVAL); + pdu = file->private_data; + if (libcfs_psdev_ops.p_close != NULL) + rc = libcfs_psdev_ops.p_close(0, (void *)pdu); + else + rc = -EPERM; + return rc; +} + +static long libcfs_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct cfs_psdev_file pfile; + int rc = 0; + + if (current_fsuid() != 0) + return -EACCES; + + if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return (-EINVAL); + } + + /* Handle platform-dependent IOC requests */ + switch (cmd) { + case IOC_LIBCFS_PANIC: + if (!cfs_capable(CFS_CAP_SYS_BOOT)) + return (-EPERM); + panic("debugctl-invoked panic"); + return (0); + case IOC_LIBCFS_MEMHOG: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + /* go thought */ + } + + pfile.off = 0; + pfile.private_data = file->private_data; + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + else + rc = -EPERM; + return (rc); +} + +static struct file_operations libcfs_fops = { + unlocked_ioctl: libcfs_ioctl, + open : libcfs_psdev_open, + release : libcfs_psdev_release +}; + +psdev_t libcfs_dev = { + LNET_MINOR, + "lnet", + &libcfs_fops +}; diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c new file mode 100644 index 000000000000..b652a79a4811 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-prim.c @@ -0,0 +1,259 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include +#include +#include + +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +#define LINUX_WAITQ(w) ((wait_queue_t *) w) +#define LINUX_WAITQ_HEAD(w) ((wait_queue_head_t *) w) + +void +init_waitqueue_entry_current(wait_queue_t *link) +{ + init_waitqueue_entry(LINUX_WAITQ(link), current); +} +EXPORT_SYMBOL(init_waitqueue_entry_current); + +/** + * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * waiting threads, which is not always desirable because all threads will + * be waken up again and again, even user only needs a few of them to be + * active most time. This is not good for performance because cache can + * be polluted by different threads. + * + * LIFO list can resolve this problem because we always wakeup the most + * recent active thread by default. + * + * NB: please don't call non-exclusive & exclusive wait on the same + * waitq if add_wait_queue_exclusive_head is used. + */ +void +add_wait_queue_exclusive_head(wait_queue_head_t *waitq, wait_queue_t *link) +{ + unsigned long flags; + + spin_lock_irqsave(&LINUX_WAITQ_HEAD(waitq)->lock, flags); + __add_wait_queue_exclusive(LINUX_WAITQ_HEAD(waitq), LINUX_WAITQ(link)); + spin_unlock_irqrestore(&LINUX_WAITQ_HEAD(waitq)->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_head); + +void +waitq_wait(wait_queue_t *link, cfs_task_state_t state) +{ + schedule(); +} +EXPORT_SYMBOL(waitq_wait); + +int64_t +waitq_timedwait(wait_queue_t *link, cfs_task_state_t state, + int64_t timeout) +{ + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(waitq_timedwait); + +void +schedule_timeout_and_set_state(cfs_task_state_t state, int64_t timeout) +{ + set_current_state(state); + schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_and_set_state); + +/* deschedule for a bit... */ +void +cfs_pause(cfs_duration_t ticks) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} +EXPORT_SYMBOL(cfs_pause); + +void cfs_init_timer(timer_list_t *t) +{ + init_timer(t); +} +EXPORT_SYMBOL(cfs_init_timer); + +void cfs_timer_init(timer_list_t *t, cfs_timer_func_t *func, void *arg) +{ + init_timer(t); + t->function = func; + t->data = (unsigned long)arg; +} +EXPORT_SYMBOL(cfs_timer_init); + +void cfs_timer_done(timer_list_t *t) +{ + return; +} +EXPORT_SYMBOL(cfs_timer_done); + +void cfs_timer_arm(timer_list_t *t, cfs_time_t deadline) +{ + mod_timer(t, deadline); +} +EXPORT_SYMBOL(cfs_timer_arm); + +void cfs_timer_disarm(timer_list_t *t) +{ + del_timer(t); +} +EXPORT_SYMBOL(cfs_timer_disarm); + +int cfs_timer_is_armed(timer_list_t *t) +{ + return timer_pending(t); +} +EXPORT_SYMBOL(cfs_timer_is_armed); + +cfs_time_t cfs_timer_deadline(timer_list_t *t) +{ + return t->expires; +} +EXPORT_SYMBOL(cfs_timer_deadline); + +void cfs_enter_debugger(void) +{ +#if defined(CONFIG_KGDB) +// BREAKPOINT(); +#else + /* nothing */ +#endif +} + + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(); + SIGNAL_MASK_UNLOCK(current, flags); + + return old; +} + +sigset_t cfs_block_sigs(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, sigs); + recalc_sigpending(); + SIGNAL_MASK_UNLOCK(current, flags); + return old; +} + +/* Block all signals except for the @sigs */ +sigset_t cfs_block_sigsinv(unsigned long sigs) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + sigaddsetmask(¤t->blocked, ~sigs); + recalc_sigpending(); + SIGNAL_MASK_UNLOCK(current, flags); + + return old; +} + +void +cfs_restore_sigs (sigset_t old) +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + current->blocked = old; + recalc_sigpending(); + SIGNAL_MASK_UNLOCK(current, flags); +} + +int +cfs_signal_pending(void) +{ + return signal_pending(current); +} + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + clear_tsk_thread_flag(current, TIF_SIGPENDING); + SIGNAL_MASK_UNLOCK(current, flags); +} + +int +libcfs_arch_init(void) +{ + return 0; +} + +void +libcfs_arch_cleanup(void) +{ + return; +} + +EXPORT_SYMBOL(libcfs_arch_init); +EXPORT_SYMBOL(libcfs_arch_cleanup); +EXPORT_SYMBOL(cfs_enter_debugger); +EXPORT_SYMBOL(cfs_block_allsigs); +EXPORT_SYMBOL(cfs_block_sigs); +EXPORT_SYMBOL(cfs_block_sigsinv); +EXPORT_SYMBOL(cfs_restore_sigs); +EXPORT_SYMBOL(cfs_signal_pending); +EXPORT_SYMBOL(cfs_clear_sigpending); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c new file mode 100644 index 000000000000..522b28e99e41 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-proc.c @@ -0,0 +1,580 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-proc.c + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +#ifdef CONFIG_SYSCTL +static ctl_table_header_t *lnet_table_header = NULL; +#endif +extern char lnet_upcall[1024]; +/** + * The path of debug log dump upcall script. + */ +extern char lnet_debug_log_upcall[1024]; + +#define CTL_LNET (0x100) +enum { + PSDEV_DEBUG = 1, /* control debugging */ + PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ + PSDEV_PRINTK, /* force all messages to console */ + PSDEV_CONSOLE_RATELIMIT, /* ratelimit console messages */ + PSDEV_CONSOLE_MAX_DELAY_CS, /* maximum delay over which we skip messages */ + PSDEV_CONSOLE_MIN_DELAY_CS, /* initial delay over which we skip messages */ + PSDEV_CONSOLE_BACKOFF, /* delay increase factor */ + PSDEV_DEBUG_PATH, /* crashdump log location */ + PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_CPT_TABLE, /* information about cpu partitions */ + PSDEV_LNET_UPCALL, /* User mode upcall script */ + PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */ + PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */ + PSDEV_LNET_PANIC_ON_LBUG, /* flag to panic on LBUG */ + PSDEV_LNET_DUMP_KERNEL, /* snapshot kernel debug buffer to file */ + PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */ + PSDEV_LNET_DEBUG_MB, /* size of debug buffer */ + PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */ + PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */ + PSDEV_LNET_FORCE_LBUG, /* hook to force an LBUG */ + PSDEV_LNET_FAIL_LOC, /* control test failures instrumentation */ + PSDEV_LNET_FAIL_VAL, /* userdata for fail loc */ +}; + +int +proc_call_handler(void *data, int write, + loff_t *ppos, void *buffer, size_t *lenp, + int (*handler)(void *data, int write, + loff_t pos, void *buffer, int len)) +{ + int rc = handler(data, write, *ppos, buffer, *lenp); + + if (rc < 0) + return rc; + + if (write) { + *ppos += *lenp; + } else { + *lenp = rc; + *ppos += rc; + } + return 0; +} +EXPORT_SYMBOL(proc_call_handler); + +static int __proc_dobitmasks(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + const int tmpstrlen = 512; + char *tmpstr; + int rc; + unsigned int *mask = data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + int is_printk = (mask == &libcfs_printk) ? 1 : 0; + + rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); + if (rc < 0) + return rc; + + if (!write) { + libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); + rc = strlen(tmpstr); + + if (pos >= rc) { + rc = 0; + } else { + rc = cfs_trace_copyout_string(buffer, nob, + tmpstr + pos, "\n"); + } + } else { + rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); + if (rc < 0) { + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; + } + + rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); + /* Always print LBUG/LASSERT to console, so keep this mask */ + if (is_printk) + *mask |= D_EMERG; + } + + cfs_trace_free_string_buffer(tmpstr, tmpstrlen); + return rc; +} + +DECLARE_PROC_HANDLER(proc_dobitmasks) + +static int min_watchdog_ratelimit = 0; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + +static int __proc_dump_kernel(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + if (!write) + return 0; + + return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); +} + +DECLARE_PROC_HANDLER(proc_dump_kernel) + +static int __proc_daemon_file(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + if (!write) { + int len = strlen(cfs_tracefile); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, + cfs_tracefile + pos, "\n"); + } + + return cfs_trace_daemon_command_usrstr(buffer, nob); +} + +DECLARE_PROC_HANDLER(proc_daemon_file) + +static int __proc_debug_mb(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + if (!write) { + char tmpstr[32]; + int len = snprintf(tmpstr, sizeof(tmpstr), "%d", + cfs_trace_get_debug_mb()); + + if (pos >= len) + return 0; + + return cfs_trace_copyout_string(buffer, nob, tmpstr + pos, + "\n"); + } + + return cfs_trace_set_debug_mb_usrstr(buffer, nob); +} + +DECLARE_PROC_HANDLER(proc_debug_mb) + +int LL_PROC_PROTO(proc_console_max_delay_cs) +{ + int rc, max_delay_cs; + ctl_table_t dummy = *table; + cfs_duration_t d; + + dummy.data = &max_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + max_delay_cs = cfs_duration_sec(libcfs_console_max_delay * 100); + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + return rc; + } + + /* write */ + max_delay_cs = 0; + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (max_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(max_delay_cs) / 100; + if (d == 0 || d < libcfs_console_min_delay) + return -EINVAL; + libcfs_console_max_delay = d; + + return rc; +} + +int LL_PROC_PROTO(proc_console_min_delay_cs) +{ + int rc, min_delay_cs; + ctl_table_t dummy = *table; + cfs_duration_t d; + + dummy.data = &min_delay_cs; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + min_delay_cs = cfs_duration_sec(libcfs_console_min_delay * 100); + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + return rc; + } + + /* write */ + min_delay_cs = 0; + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (min_delay_cs <= 0) + return -EINVAL; + + d = cfs_time_seconds(min_delay_cs) / 100; + if (d == 0 || d > libcfs_console_max_delay) + return -EINVAL; + libcfs_console_min_delay = d; + + return rc; +} + +int LL_PROC_PROTO(proc_console_backoff) +{ + int rc, backoff; + ctl_table_t dummy = *table; + + dummy.data = &backoff; + dummy.proc_handler = &proc_dointvec; + + if (!write) { /* read */ + backoff= libcfs_console_backoff; + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + return rc; + } + + /* write */ + backoff = 0; + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); + if (rc < 0) + return rc; + if (backoff <= 0) + return -EINVAL; + + libcfs_console_backoff = backoff; + + return rc; +} + +int LL_PROC_PROTO(libcfs_force_lbug) +{ + if (write) + LBUG(); + return 0; +} + +int LL_PROC_PROTO(proc_fail_loc) +{ + int rc; + long old_fail_loc = cfs_fail_loc; + + rc = ll_proc_dolongvec(table, write, filp, buffer, lenp, ppos); + if (old_fail_loc != cfs_fail_loc) + wake_up(&cfs_race_waitq); + return rc; +} + +static int __proc_cpt_table(void *data, int write, + loff_t pos, void *buffer, int nob) +{ + char *buf = NULL; + int len = 4096; + int rc = 0; + + if (write) + return -EPERM; + + LASSERT(cfs_cpt_table != NULL); + + while (1) { + LIBCFS_ALLOC(buf, len); + if (buf == NULL) + return -ENOMEM; + + rc = cfs_cpt_table_print(cfs_cpt_table, buf, len); + if (rc >= 0) + break; + + LIBCFS_FREE(buf, len); + if (rc == -EFBIG) { + len <<= 1; + continue; + } + goto out; + } + + if (pos >= rc) { + rc = 0; + goto out; + } + + rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); + out: + if (buf != NULL) + LIBCFS_FREE(buf, len); + return rc; +} +DECLARE_PROC_HANDLER(proc_cpt_table) + +static ctl_table_t lnet_table[] = { + /* + * NB No .strategy entries have been provided since sysctl(8) prefers + * to go via /proc for portability. + */ + { + INIT_CTL_NAME(PSDEV_DEBUG) + .procname = "debug", + .data = &libcfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME(PSDEV_SUBSYSTEM_DEBUG) + .procname = "subsystem_debug", + .data = &libcfs_subsystem_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME(PSDEV_PRINTK) + .procname = "printk", + .data = &libcfs_printk, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dobitmasks, + }, + { + INIT_CTL_NAME(PSDEV_CONSOLE_RATELIMIT) + .procname = "console_ratelimit", + .data = &libcfs_console_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(PSDEV_CONSOLE_MAX_DELAY_CS) + .procname = "console_max_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_max_delay_cs + }, + { + INIT_CTL_NAME(PSDEV_CONSOLE_MIN_DELAY_CS) + .procname = "console_min_delay_centisecs", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_min_delay_cs + }, + { + INIT_CTL_NAME(PSDEV_CONSOLE_BACKOFF) + .procname = "console_backoff", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_console_backoff + }, + + { + INIT_CTL_NAME(PSDEV_DEBUG_PATH) + .procname = "debug_path", + .data = libcfs_debug_file_path_arr, + .maxlen = sizeof(libcfs_debug_file_path_arr), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + + { + INIT_CTL_NAME(PSDEV_CPT_TABLE) + .procname = "cpu_partition_table", + .maxlen = 128, + .mode = 0444, + .proc_handler = &proc_cpt_table, + }, + + { + INIT_CTL_NAME(PSDEV_LNET_UPCALL) + .procname = "upcall", + .data = lnet_upcall, + .maxlen = sizeof(lnet_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + INIT_CTL_NAME(PSDEV_LNET_DEBUG_LOG_UPCALL) + .procname = "debug_log_upcall", + .data = lnet_debug_log_upcall, + .maxlen = sizeof(lnet_debug_log_upcall), + .mode = 0644, + .proc_handler = &proc_dostring, + }, + { + INIT_CTL_NAME(PSDEV_LNET_MEMUSED) + .procname = "lnet_memused", + .data = (int *)&libcfs_kmemory.counter, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY(&sysctl_intvec) + }, + { + INIT_CTL_NAME(PSDEV_LNET_CATASTROPHE) + .procname = "catastrophe", + .data = &libcfs_catastrophe, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY(&sysctl_intvec) + }, + { + INIT_CTL_NAME(PSDEV_LNET_PANIC_ON_LBUG) + .procname = "panic_on_lbug", + .data = &libcfs_panic_on_lbug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY(&sysctl_intvec) + }, + { + INIT_CTL_NAME(PSDEV_LNET_DUMP_KERNEL) + .procname = "dump_kernel", + .maxlen = 256, + .mode = 0200, + .proc_handler = &proc_dump_kernel, + }, + { + INIT_CTL_NAME(PSDEV_LNET_DAEMON_FILE) + .procname = "daemon_file", + .mode = 0644, + .maxlen = 256, + .proc_handler = &proc_daemon_file, + }, + { + INIT_CTL_NAME(PSDEV_LNET_DEBUG_MB) + .procname = "debug_mb", + .mode = 0644, + .proc_handler = &proc_debug_mb, + }, + { + INIT_CTL_NAME(PSDEV_LNET_WATCHDOG_RATELIMIT) + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, + { INIT_CTL_NAME(PSDEV_LNET_FORCE_LBUG) + .procname = "force_lbug", + .data = NULL, + .maxlen = 0, + .mode = 0200, + .proc_handler = &libcfs_force_lbug + }, + { + INIT_CTL_NAME(PSDEV_LNET_FAIL_LOC) + .procname = "fail_loc", + .data = &cfs_fail_loc, + .maxlen = sizeof(cfs_fail_loc), + .mode = 0644, + .proc_handler = &proc_fail_loc + }, + { + INIT_CTL_NAME(PSDEV_LNET_FAIL_VAL) + .procname = "fail_val", + .data = &cfs_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(0) + } +}; + +#ifdef CONFIG_SYSCTL +static ctl_table_t top_table[] = { + { + INIT_CTL_NAME(CTL_LNET) + .procname = "lnet", + .mode = 0555, + .data = NULL, + .maxlen = 0, + .child = lnet_table, + }, + { + INIT_CTL_NAME(0) + } +}; +#endif + +int insert_proc(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header == NULL) + lnet_table_header = cfs_register_sysctl_table(top_table, 0); +#endif + return 0; +} + +void remove_proc(void) +{ +#ifdef CONFIG_SYSCTL + if (lnet_table_header != NULL) + unregister_sysctl_table(lnet_table_header); + + lnet_table_header = NULL; +#endif +} diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c new file mode 100644 index 000000000000..a3043478b7c1 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-sync.c @@ -0,0 +1,35 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +# define DEBUG_SUBSYSTEM S_LNET diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c new file mode 100644 index 000000000000..4a018167cb03 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c @@ -0,0 +1,664 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#include +#include +#include +/* For sys_open & sys_close */ +#include + +int +libcfs_sock_ioctl(int cmd, unsigned long arg) +{ + mm_segment_t oldmm = get_fs(); + struct socket *sock; + int fd = -1; + int rc; + struct file *sock_filp; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + sock_filp = sock_alloc_file(sock, 0, NULL); + if (!sock_filp) { + rc = -ENOMEM; + goto out; + } + + set_fs(KERNEL_DS); + if (sock_filp->f_op->unlocked_ioctl) + rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg); + set_fs(oldmm); + + fput(sock_filp); + + out: + if (fd >= 0) + sys_close(fd); + else + sock_release(sock); + return rc; +} + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + int nob; + int rc; + __u32 val; + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + return -EINVAL; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + + strcpy(ifr.ifr_name, name); + rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + return rc; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + return 0; + } + + *up = 1; + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + return rc; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + return 0; +} + +EXPORT_SYMBOL(libcfs_ipif_query); + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; + + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) { + toobig = 1; + nalloc = PAGE_CACHE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT (rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + /* NULL out all names[i] */ + memset (names, 0, nfound * sizeof(*names)); + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); + out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_enumerate); + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} + +EXPORT_SYMBOL(libcfs_ipif_free_enumeration); + +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket send timeout " + "%ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + set_fs (KERNEL_DS); + then = jiffies; + rc = sock_sendmsg (sock, &msg, iov.iov_len); + ticks -= jiffies - then; + set_fs (oldmm); + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return (-ECONNABORTED); + } + + if (ticks <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} +EXPORT_SYMBOL(libcfs_sock_write); + +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + LASSERT (ticks > 0); + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + set_fs(KERNEL_DS); + then = jiffies; + rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); + ticks -= jiffies - then; + set_fs(oldmm); + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (ticks <= 0) + return -ETIMEDOUT; + } +} + +EXPORT_SYMBOL(libcfs_sock_read); + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + mm_segment_t oldmm = get_fs(); + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (rc); + } + + set_fs (KERNEL_DS); + option = 1; + rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? + INADDR_ANY : htonl(local_ip); + + rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + + return 0; + + failed: + sock_release(sock); + return rc; +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + mm_segment_t oldmm = get_fs(); + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return (rc); + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return (rc); + } + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_setbuf); + +int +libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; + + rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len, + remote ? 2 : 0); + if (rc != 0) { + CERROR ("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl (sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs (sin.sin_port); + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getaddr); + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + + if (txbufsize != NULL) { + *txbufsize = sock->sk->sk_sndbuf; + } + + if (rxbufsize != NULL) { + *rxbufsize = sock->sk->sk_rcvbuf; + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getbuf); + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = (*sockp)->ops->listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_listen); + +int +libcfs_sock_accept (struct socket **newsockp, struct socket *sock) +{ + wait_queue_t wait; + struct socket *newsock; + int rc; + + init_waitqueue_entry(&wait, current); + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(cfs_sk_sleep(sock->sk), &wait); + + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + schedule(); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + } + + remove_wait_queue(cfs_sk_sleep(sock->sk), &wait); + set_current_state(TASK_RUNNING); + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + + failed: + sock_release(newsock); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_accept); + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wake_up_all(cfs_sk_sleep(sock->sk)); +} + +EXPORT_SYMBOL(libcfs_sock_abort_accept); + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = (*sockp)->ops->connect(*sockp, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_connect); + +void +libcfs_sock_release (struct socket *sock) +{ + sock_release(sock); +} + +EXPORT_SYMBOL(libcfs_sock_release); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c new file mode 100644 index 000000000000..6f563436a255 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.c @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE + +#include +#include "tracefile.h" + +/* percents to share the total debug memory for each type */ +static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { + 80, /* 80% pages for CFS_TCD_TYPE_PROC */ + 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ + 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ +}; + +char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; + +struct rw_semaphore cfs_tracefile_sem; + +int cfs_tracefile_init_arch() +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + init_rwsem(&cfs_tracefile_sem); + + /* initialize trace_data */ + memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); + for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { + cfs_trace_data[i] = + kmalloc(sizeof(union cfs_trace_data_union) * + num_possible_cpus(), GFP_KERNEL); + if (cfs_trace_data[i] == NULL) + goto out; + + } + + /* arch related info initialized */ + cfs_tcd_for_each(tcd, i, j) { + spin_lock_init(&tcd->tcd_lock); + tcd->tcd_pages_factor = pages_factor[i]; + tcd->tcd_type = i; + tcd->tcd_cpu = j; + } + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) { + cfs_trace_console_buffers[i][j] = + kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (cfs_trace_console_buffers[i][j] == NULL) + goto out; + } + + return 0; + +out: + cfs_tracefile_fini_arch(); + printk(KERN_ERR "lnet: Not enough memory\n"); + return -ENOMEM; +} + +void cfs_tracefile_fini_arch() +{ + int i; + int j; + + for (i = 0; i < num_possible_cpus(); i++) + for (j = 0; j < 3; j++) + if (cfs_trace_console_buffers[i][j] != NULL) { + kfree(cfs_trace_console_buffers[i][j]); + cfs_trace_console_buffers[i][j] = NULL; + } + + for (i = 0; cfs_trace_data[i] != NULL; i++) { + kfree(cfs_trace_data[i]); + cfs_trace_data[i] = NULL; + } + + fini_rwsem(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_lock() +{ + down_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_read_unlock() +{ + up_read(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_lock() +{ + down_write(&cfs_tracefile_sem); +} + +void cfs_tracefile_write_unlock() +{ + up_write(&cfs_tracefile_sem); +} + +cfs_trace_buf_type_t cfs_trace_buf_idx_get() +{ + if (in_irq()) + return CFS_TCD_TYPE_IRQ; + else if (in_softirq()) + return CFS_TCD_TYPE_SOFTIRQ; + else + return CFS_TCD_TYPE_PROC; +} + +/* + * The walking argument indicates the locking comes from all tcd types + * iterator and we must lock it and dissable local irqs to avoid deadlocks + * with other interrupt locks that might be happening. See LU-1311 + * for details. + */ +int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_lock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_lock_irq(&tcd->tcd_lock); + else + spin_lock(&tcd->tcd_lock); + return 1; +} + +void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) +{ + __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); + if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) + spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); + else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) + spin_unlock_bh(&tcd->tcd_lock); + else if (unlikely(walking)) + spin_unlock_irq(&tcd->tcd_lock); + else + spin_unlock(&tcd->tcd_lock); +} + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; +} + +void +cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *msgdata, + unsigned long stack) +{ + struct timeval tv; + + do_gettimeofday(&tv); + + header->ph_subsys = msgdata->msg_subsys; + header->ph_mask = msgdata->msg_mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); + header->ph_sec = (__u32)tv.tv_sec; + header->ph_usec = tv.tv_usec; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = msgdata->msg_line; + header->ph_extern_pid = 0; + return; +} + +static char * +dbghdr_to_err_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNetError"; + default: + return "LustreError"; + } +} + +static char * +dbghdr_to_info_string(struct ptldebug_header *hdr) +{ + switch (hdr->ph_subsys) { + + case S_LND: + case S_LNET: + return "LNet"; + default: + return "Lustre"; + } +} + +void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = dbghdr_to_err_string(hdr); + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_WARNING; + } else if ((mask & (D_CONSOLE | libcfs_printk)) != 0) { + prefix = dbghdr_to_info_string(hdr); + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %.*s", ptype, prefix, len, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, + hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, + fn, len, buf); + } + return; +} + +int cfs_trace_max_debug_mb(void) +{ + int total_mb = (num_physpages >> (20 - PAGE_SHIFT)); + + return MAX(512, (total_mb * 80)/100); +} diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h new file mode 100644 index 000000000000..ba84e4ffddd1 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tracefile.h @@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_LINUX_TRACEFILE_H__ +#define __LIBCFS_LINUX_TRACEFILE_H__ + +/** + * three types of trace_data in linux + */ +typedef enum { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +} cfs_trace_buf_type_t; + +#endif diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c new file mode 100644 index 000000000000..e73903cde212 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-utils.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/linux/linux-utils.c + * + * Author: Phil Schwan + */ + +/* + * miscellaneous libcfs stuff + */ +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +/* + * Convert server error code to client format. Error codes are from + * Linux errno.h, so for Linux client---identity. + */ +int convert_server_error(__u64 ecode) +{ + return ecode; +} +EXPORT_SYMBOL(convert_server_error); + +/* + * convert flag from client to server. + */ +int convert_client_oflag(int cflag, int *result) +{ + *result = cflag; + return 0; +} +EXPORT_SYMBOL(convert_client_oflag); + +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{} + +EXPORT_SYMBOL(cfs_stack_trace_fill); + +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + return NULL; +} +EXPORT_SYMBOL(cfs_stack_trace_frame); diff --git a/drivers/staging/lustre/lustre/libcfs/lwt.c b/drivers/staging/lustre/lustre/libcfs/lwt.c new file mode 100644 index 000000000000..b631f7dde8e7 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/lwt.c @@ -0,0 +1,266 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/lwt.c + * + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#if LWT_SUPPORT + +#if !KLWT_SUPPORT +int lwt_enabled; +lwt_cpu_t lwt_cpus[NR_CPUS]; +#endif + +int lwt_pages_per_cpu; + +/* NB only root is allowed to retrieve LWT info; it's an open door into the + * kernel... */ + +int +lwt_lookup_string (int *size, char *knl_ptr, + char *user_ptr, int user_size) +{ + int maxsize = 128; + + /* knl_ptr was retrieved from an LWT snapshot and the caller wants to + * turn it into a string. NB we can crash with an access violation + * trying to determine the string length, so we're trusting our + * caller... */ + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + return (-EPERM); + + if (user_size > 0 && + maxsize > user_size) + maxsize = user_size; + + *size = strnlen (knl_ptr, maxsize - 1) + 1; + + if (user_ptr != NULL) { + if (user_size < 4) + return (-EINVAL); + + if (copy_to_user (user_ptr, knl_ptr, *size)) + return (-EFAULT); + + /* Did I truncate the string? */ + if (knl_ptr[*size - 1] != 0) + copy_to_user (user_ptr + *size - 4, "...", 4); + } + + return (0); +} + +int +lwt_control (int enable, int clear) +{ + lwt_page_t *p; + int i; + int j; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + return (-EPERM); + + if (!enable) { + LWT_EVENT(0,0,0,0); + lwt_enabled = 0; + mb(); + /* give people some time to stop adding traces */ + schedule_timeout(10); + } + + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + if (p == NULL) + return (-ENODATA); + + if (!clear) + continue; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + memset (p->lwtp_events, 0, PAGE_CACHE_SIZE); + + p = list_entry (p->lwtp_list.next, + lwt_page_t, lwtp_list); + } + } + + if (enable) { + lwt_enabled = 1; + mb(); + LWT_EVENT(0,0,0,0); + } + + return (0); +} + +int +lwt_snapshot (cfs_cycles_t *now, int *ncpu, int *total_size, + void *user_ptr, int user_size) +{ + const int events_per_page = PAGE_CACHE_SIZE / sizeof(lwt_event_t); + const int bytes_per_page = events_per_page * sizeof(lwt_event_t); + lwt_page_t *p; + int i; + int j; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + return (-EPERM); + + *ncpu = num_online_cpus(); + *total_size = num_online_cpus() * lwt_pages_per_cpu * + bytes_per_page; + *now = get_cycles(); + + if (user_ptr == NULL) + return (0); + + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + if (p == NULL) + return (-ENODATA); + + for (j = 0; j < lwt_pages_per_cpu; j++) { + if (copy_to_user(user_ptr, p->lwtp_events, + bytes_per_page)) + return (-EFAULT); + + user_ptr = ((char *)user_ptr) + bytes_per_page; + p = list_entry(p->lwtp_list.next, + lwt_page_t, lwtp_list); + } + } + + return (0); +} + +int +lwt_init () +{ + int i; + int j; + + for (i = 0; i < num_online_cpus(); i++) + if (lwt_cpus[i].lwtc_current_page != NULL) + return (-EALREADY); + + LASSERT (!lwt_enabled); + + /* NULL pointers, zero scalars */ + memset (lwt_cpus, 0, sizeof (lwt_cpus)); + lwt_pages_per_cpu = + LWT_MEMORY / (num_online_cpus() * PAGE_CACHE_SIZE); + + for (i = 0; i < num_online_cpus(); i++) + for (j = 0; j < lwt_pages_per_cpu; j++) { + struct page *page = alloc_page (GFP_KERNEL); + lwt_page_t *lwtp; + + if (page == NULL) { + CERROR ("Can't allocate page\n"); + lwt_fini (); + return (-ENOMEM); + } + + LIBCFS_ALLOC(lwtp, sizeof (*lwtp)); + if (lwtp == NULL) { + CERROR ("Can't allocate lwtp\n"); + __free_page(page); + lwt_fini (); + return (-ENOMEM); + } + + lwtp->lwtp_page = page; + lwtp->lwtp_events = page_address(page); + memset (lwtp->lwtp_events, 0, PAGE_CACHE_SIZE); + + if (j == 0) { + INIT_LIST_HEAD (&lwtp->lwtp_list); + lwt_cpus[i].lwtc_current_page = lwtp; + } else { + list_add (&lwtp->lwtp_list, + &lwt_cpus[i].lwtc_current_page->lwtp_list); + } + } + + lwt_enabled = 1; + mb(); + + LWT_EVENT(0,0,0,0); + + return (0); +} + +void +lwt_fini () +{ + int i; + + lwt_control(0, 0); + + for (i = 0; i < num_online_cpus(); i++) + while (lwt_cpus[i].lwtc_current_page != NULL) { + lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page; + + if (list_empty (&lwtp->lwtp_list)) { + lwt_cpus[i].lwtc_current_page = NULL; + } else { + lwt_cpus[i].lwtc_current_page = + list_entry (lwtp->lwtp_list.next, + lwt_page_t, lwtp_list); + + list_del (&lwtp->lwtp_list); + } + + __free_page (lwtp->lwtp_page); + LIBCFS_FREE (lwtp, sizeof (*lwtp)); + } +} + +EXPORT_SYMBOL(lwt_enabled); +EXPORT_SYMBOL(lwt_cpus); + +EXPORT_SYMBOL(lwt_init); +EXPORT_SYMBOL(lwt_fini); +EXPORT_SYMBOL(lwt_lookup_string); +EXPORT_SYMBOL(lwt_control); +EXPORT_SYMBOL(lwt_snapshot); +#endif diff --git a/drivers/staging/lustre/lustre/libcfs/module.c b/drivers/staging/lustre/lustre/libcfs/module.c new file mode 100644 index 000000000000..3372537c6f3b --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/module.c @@ -0,0 +1,498 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include +#include +#include "tracefile.h" + +void +kportal_memhog_free (struct libcfs_device_userstate *ldu) +{ + struct page **level0p = &ldu->ldu_memhog_root_page; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + if (*level0p != NULL) { + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + + while (count1 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level1p != NULL) { + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + + while (count2 < PAGE_CACHE_SIZE/sizeof(struct page *) && + *level2p != NULL) { + + __free_page(*level2p); + ldu->ldu_memhog_pages--; + level2p++; + count2++; + } + + __free_page(*level1p); + ldu->ldu_memhog_pages--; + level1p++; + count1++; + } + + __free_page(*level0p); + ldu->ldu_memhog_pages--; + + *level0p = NULL; + } + + LASSERT (ldu->ldu_memhog_pages == 0); +} + +int +kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags) +{ + struct page **level0p; + struct page **level1p; + struct page **level2p; + int count1; + int count2; + + LASSERT (ldu->ldu_memhog_pages == 0); + LASSERT (ldu->ldu_memhog_root_page == NULL); + + if (npages < 0) + return -EINVAL; + + if (npages == 0) + return 0; + + level0p = &ldu->ldu_memhog_root_page; + *level0p = alloc_page(flags); + if (*level0p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level1p = (struct page **)page_address(*level0p); + count1 = 0; + memset(level1p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count1 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return (-EINTR); + + *level1p = alloc_page(flags); + if (*level1p == NULL) + return -ENOMEM; + ldu->ldu_memhog_pages++; + + level2p = (struct page **)page_address(*level1p); + count2 = 0; + memset(level2p, 0, PAGE_CACHE_SIZE); + + while (ldu->ldu_memhog_pages < npages && + count2 < PAGE_CACHE_SIZE/sizeof(struct page *)) { + + if (cfs_signal_pending()) + return (-EINTR); + + *level2p = alloc_page(flags); + if (*level2p == NULL) + return (-ENOMEM); + ldu->ldu_memhog_pages++; + + level2p++; + count2++; + } + + level1p++; + count1++; + } + + return 0; +} + +/* called when opening /dev/device */ +static int libcfs_psdev_open(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + ENTRY; + + try_module_get(THIS_MODULE); + + LIBCFS_ALLOC(ldu, sizeof(*ldu)); + if (ldu != NULL) { + ldu->ldu_memhog_pages = 0; + ldu->ldu_memhog_root_page = NULL; + } + *(struct libcfs_device_userstate **)args = ldu; + + RETURN(0); +} + +/* called when closing /dev/device */ +static int libcfs_psdev_release(unsigned long flags, void *args) +{ + struct libcfs_device_userstate *ldu; + ENTRY; + + ldu = (struct libcfs_device_userstate *)args; + if (ldu != NULL) { + kportal_memhog_free(ldu); + LIBCFS_FREE(ldu, sizeof(*ldu)); + } + + module_put(THIS_MODULE); + RETURN(0); +} + +static struct rw_semaphore ioctl_list_sem; +static struct list_head ioctl_list; + +int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (!list_empty(&hand->item)) + rc = -EBUSY; + else + list_add_tail(&hand->item, &ioctl_list); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_register_ioctl); + +int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) +{ + int rc = 0; + + down_write(&ioctl_list_sem); + if (list_empty(&hand->item)) + rc = -ENOENT; + else + list_del_init(&hand->item); + up_write(&ioctl_list_sem); + + return rc; +} +EXPORT_SYMBOL(libcfs_deregister_ioctl); + +static int libcfs_ioctl_int(struct cfs_psdev_file *pfile,unsigned long cmd, + void *arg, struct libcfs_ioctl_data *data) +{ + int err = -EINVAL; + ENTRY; + + switch (cmd) { + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); + RETURN(0); + /* + * case IOC_LIBCFS_PANIC: + * Handled in arch/cfs_module.c + */ + case IOC_LIBCFS_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + RETURN(-EINVAL); + libcfs_debug_mark_buffer(data->ioc_inlbuf1); + RETURN(0); +#if LWT_SUPPORT + case IOC_LIBCFS_LWT_CONTROL: + err = lwt_control ((data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0); + break; + + case IOC_LIBCFS_LWT_SNAPSHOT: { + cfs_cycles_t now; + int ncpu; + int total_size; + + err = lwt_snapshot (&now, &ncpu, &total_size, + data->ioc_pbuf1, data->ioc_plen1); + data->ioc_u64[0] = now; + data->ioc_u32[0] = ncpu; + data->ioc_u32[1] = total_size; + + /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */ + data->ioc_u32[2] = sizeof(lwt_event_t); + data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where); + + if (err == 0 && + libcfs_ioctl_popdata(arg, data, sizeof (*data))) + err = -EFAULT; + break; + } + + case IOC_LIBCFS_LWT_LOOKUP_STRING: + err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, + data->ioc_pbuf2, data->ioc_plen2); + if (err == 0 && + libcfs_ioctl_popdata(arg, data, sizeof (*data))) + err = -EFAULT; + break; +#endif + case IOC_LIBCFS_MEMHOG: + if (pfile->private_data == NULL) { + err = -EINVAL; + } else { + kportal_memhog_free(pfile->private_data); + /* XXX The ioc_flags is not GFP flags now, need to be fixed */ + err = kportal_memhog_alloc(pfile->private_data, + data->ioc_count, + data->ioc_flags); + if (err != 0) + kportal_memhog_free(pfile->private_data); + } + break; + + case IOC_LIBCFS_PING_TEST: { + extern void (kping_client)(struct libcfs_ioctl_data *); + void (*ping)(struct libcfs_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n", + data->ioc_count, libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(data->ioc_nid)); + ping = symbol_get(kping_client); + if (!ping) + CERROR("symbol_get failed\n"); + else { + ping(data); + symbol_put(kping_client); + } + RETURN(0); + } + + default: { + struct libcfs_ioctl_handler *hand; + err = -EINVAL; + down_read(&ioctl_list_sem); + list_for_each_entry(hand, &ioctl_list, item) { + err = hand->handle_ioctl(cmd, data); + if (err != -EINVAL) { + if (err == 0) + err = libcfs_ioctl_popdata(arg, + data, sizeof (*data)); + break; + } + } + up_read(&ioctl_list_sem); + break; + } + } + + RETURN(err); +} + +static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *arg) +{ + char *buf; + struct libcfs_ioctl_data *data; + int err = 0; + ENTRY; + + LIBCFS_ALLOC_GFP(buf, 1024, GFP_IOFS); + if (buf == NULL) + RETURN(-ENOMEM); + + /* 'cmd' and permissions get checked in our arch-specific caller */ + if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + GOTO(out, err = -EINVAL); + } + data = (struct libcfs_ioctl_data *)buf; + + err = libcfs_ioctl_int(pfile, cmd, arg, data); + +out: + LIBCFS_FREE(buf, 1024); + RETURN(err); +} + + +struct cfs_psdev_ops libcfs_psdev_ops = { + libcfs_psdev_open, + libcfs_psdev_release, + NULL, + NULL, + libcfs_ioctl +}; + +extern int insert_proc(void); +extern void remove_proc(void); +MODULE_AUTHOR("Peter J. Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +extern psdev_t libcfs_dev; +extern struct rw_semaphore cfs_tracefile_sem; +extern struct mutex cfs_trace_thread_mutex; +extern struct cfs_wi_sched *cfs_sched_rehash; + +extern void libcfs_init_nidstrings(void); +extern int libcfs_arch_init(void); +extern void libcfs_arch_cleanup(void); + +static int init_libcfs_module(void) +{ + int rc; + + libcfs_arch_init(); + libcfs_init_nidstrings(); + init_rwsem(&cfs_tracefile_sem); + mutex_init(&cfs_trace_thread_mutex); + init_rwsem(&ioctl_list_sem); + INIT_LIST_HEAD(&ioctl_list); + init_waitqueue_head(&cfs_race_waitq); + + rc = libcfs_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); + return (rc); + } + + rc = cfs_cpu_init(); + if (rc != 0) + goto cleanup_debug; + +#if LWT_SUPPORT + rc = lwt_init(); + if (rc != 0) { + CERROR("lwt_init: error %d\n", rc); + goto cleanup_debug; + } +#endif + rc = misc_register(&libcfs_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_lwt; + } + + rc = cfs_wi_startup(); + if (rc) { + CERROR("initialize workitem: error %d\n", rc); + goto cleanup_deregister; + } + + /* max to 4 threads, should be enough for rehash */ + rc = min(cfs_cpt_weight(cfs_cpt_table, CFS_CPT_ANY), 4); + rc = cfs_wi_sched_create("cfs_rh", cfs_cpt_table, CFS_CPT_ANY, + rc, &cfs_sched_rehash); + if (rc != 0) { + CERROR("Startup workitem scheduler: error: %d\n", rc); + goto cleanup_deregister; + } + + rc = cfs_crypto_register(); + if (rc) { + CERROR("cfs_crypto_regster: error %d\n", rc); + goto cleanup_wi; + } + + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_crypto; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return 0; + cleanup_crypto: + cfs_crypto_unregister(); + cleanup_wi: + cfs_wi_shutdown(); + cleanup_deregister: + misc_deregister(&libcfs_dev); + cleanup_lwt: +#if LWT_SUPPORT + lwt_fini(); +#endif + cleanup_debug: + libcfs_debug_cleanup(); + return rc; +} + +static void exit_libcfs_module(void) +{ + int rc; + + remove_proc(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + if (cfs_sched_rehash != NULL) { + cfs_wi_sched_destroy(cfs_sched_rehash); + cfs_sched_rehash = NULL; + } + + cfs_crypto_unregister(); + cfs_wi_shutdown(); + + rc = misc_deregister(&libcfs_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + +#if LWT_SUPPORT + lwt_fini(); +#endif + cfs_cpu_fini(); + + if (atomic_read(&libcfs_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&libcfs_kmemory)); + + rc = libcfs_debug_cleanup(); + if (rc) + printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", + rc); + + fini_rwsem(&ioctl_list_sem); + fini_rwsem(&cfs_tracefile_sem); + + libcfs_arch_cleanup(); +} + +cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module); diff --git a/drivers/staging/lustre/lustre/libcfs/nidstrings.c b/drivers/staging/lustre/lustre/libcfs/nidstrings.c new file mode 100644 index 000000000000..9a2d70ce2421 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/nidstrings.c @@ -0,0 +1,867 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/nidstrings.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. + */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx = 0; + +static spinlock_t libcfs_nidstring_lock; + +void libcfs_init_nidstrings (void) +{ + spin_lock_init(&libcfs_nidstring_lock); +} + +# define NIDSTR_LOCK(f) spin_lock_irqsave(&libcfs_nidstring_lock, f) +# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f) + +static char * +libcfs_next_nidstring (void) +{ + char *str; + unsigned long flags; + + NIDSTR_LOCK(flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + NIDSTR_UNLOCK(flags); + return str; +} + +static int libcfs_lo_str2addr(const char *str, int nob, __u32 *addr); +static void libcfs_ip_addr2str(__u32 addr, char *str); +static int libcfs_ip_str2addr(const char *str, int nob, __u32 *addr); +static void libcfs_decnum_addr2str(__u32 addr, char *str); +static void libcfs_hexnum_addr2str(__u32 addr, char *str); +static int libcfs_num_str2addr(const char *str, int nob, __u32 *addr); +static int libcfs_num_parse(char *str, int len, struct list_head *list); +static int libcfs_num_match(__u32 addr, struct list_head *list); + +struct netstrfns { + int nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str); + int (*nf_str2addr)(const char *str, int nob, __u32 *addr); + int (*nf_parse_addrlist)(char *str, int len, + struct list_head *list); + int (*nf_match_addr)(__u32 addr, struct list_head *list); +}; + +static struct netstrfns libcfs_netstrfns[] = { + {/* .nf_type */ LOLND, + /* .nf_name */ "lo", + /* .nf_modname */ "klolnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_lo_str2addr, + /* .nf_parse_addr*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ SOCKLND, + /* .nf_name */ "tcp", + /* .nf_modname */ "ksocklnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ O2IBLND, + /* .nf_name */ "o2ib", + /* .nf_modname */ "ko2iblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ CIBLND, + /* .nf_name */ "cib", + /* .nf_modname */ "kciblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ OPENIBLND, + /* .nf_name */ "openib", + /* .nf_modname */ "kopeniblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ IIBLND, + /* .nf_name */ "iib", + /* .nf_modname */ "kiiblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ VIBLND, + /* .nf_name */ "vib", + /* .nf_modname */ "kviblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ RALND, + /* .nf_name */ "ra", + /* .nf_modname */ "kralnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ QSWLND, + /* .nf_name */ "elan", + /* .nf_modname */ "kqswlnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GMLND, + /* .nf_name */ "gm", + /* .nf_modname */ "kgmlnd", + /* .nf_addr2str */ libcfs_hexnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ MXLND, + /* .nf_name */ "mx", + /* .nf_modname */ "kmxlnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr, + /* .nf_parse_addrlist*/ cfs_ip_addr_parse, + /* .nf_match_addr*/ cfs_ip_addr_match}, + {/* .nf_type */ PTLLND, + /* .nf_name */ "ptl", + /* .nf_modname */ "kptllnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + {/* .nf_type */ GNILND, + /* .nf_name */ "gni", + /* .nf_modname */ "kgnilnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr, + /* .nf_parse_addrlist*/ libcfs_num_parse, + /* .nf_match_addr*/ libcfs_num_match}, + /* placeholder for net0 alias. It MUST BE THE LAST ENTRY */ + {/* .nf_type */ -1}, +}; + +const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +int +libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +void +libcfs_ip_addr2str(__u32 addr, char *str) +{ +#if 0 /* never lookup */ +#endif + snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ + +int +libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) +{ + int a; + int b; + int c; + int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + + return 0; +} + +void +libcfs_decnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u", addr); +} + +void +libcfs_hexnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr); +} + +int +libcfs_num_str2addr(const char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +struct netstrfns * +libcfs_lnd2netstrfns(int lnd) +{ + int i; + + if (lnd >= 0) + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +struct netstrfns * +libcfs_namenum2netstrfns(const char *name) +{ + struct netstrfns *nf; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(name, nf->nf_name, strlen(nf->nf_name))) + return nf; + } + return NULL; +} + +struct netstrfns * +libcfs_name2netstrfns(const char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (libcfs_netstrfns[i].nf_type >= 0 && + !strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(int type) +{ + return libcfs_lnd2netstrfns(type) != NULL; +} + +char * +libcfs_lnd2modname(int lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? NULL : nf->nf_modname; +} + +char * +libcfs_lnd2str(int lnd) +{ + char *str; + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + if (nf != NULL) + return nf->nf_name; + + str = libcfs_next_nidstring(); + snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd); + return str; +} + +int +libcfs_str2lnd(const char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_net2str(__u32 net) +{ + int lnd = LNET_NETTYP(net); + int num = LNET_NETNUM(net); + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + char *str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num); + else if (num == 0) + snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name); + else + snprintf(str, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num); + + return str; +} + +char * +libcfs_nid2str(lnet_nid_t nid) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + int lnd = LNET_NETTYP(net); + int nnum = LNET_NETNUM(net); + struct netstrfns *nf; + char *str; + int nob; + + if (nid == LNET_NID_ANY) + return ""; + + nf = libcfs_lnd2netstrfns(lnd); + str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum); + else { + nf->nf_addr2str(addr, str); + nob = strlen(str); + if (nnum == 0) + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s", + nf->nf_name); + else + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%u", + nf->nf_name, nnum); + } + + return str; +} + +static struct netstrfns * +libcfs_str2net_internal(const char *str, __u32 *net) +{ + struct netstrfns *nf; + int nob; + int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(const char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} + +lnet_nid_t +libcfs_str2nid(const char *str) +{ + const char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT (nf != NULL); + } + + if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(lnet_process_id_t id) +{ + char *str = libcfs_next_nidstring(); + + if (id.pid == LNET_PID_ANY) { + snprintf(str, LNET_NIDSTR_SIZE, + "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); + return str; + } + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, const char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +/** + * Nid range list syntax. + * \verbatim + * + * :== [ ' ' ] + * :== '@' + * :== '*' | + * | + * + * :== ... + * + * :== | + * + * :== '[' [ ',' ] ']' + * :== | + * '-' | + * '-' '/' + * :== | + * :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" | + * "vib" | "ra" | "elan" | "mx" | "ptl" + * \endverbatim + */ + +/** + * Structure to represent \ token of the syntax. + * + * One of this is created for each \ parsed. + */ +struct nidrange { + /** + * Link to list of this structures which is built on nid range + * list parsing. + */ + struct list_head nr_link; + /** + * List head for addrrange::ar_link. + */ + struct list_head nr_addrranges; + /** + * Flag indicating that *@ is found. + */ + int nr_all; + /** + * Pointer to corresponding element of libcfs_netstrfns. + */ + struct netstrfns *nr_netstrfns; + /** + * Number of network. E.g. 5 if \ is "elan5". + */ + int nr_netnum; +}; + +/** + * Structure to represent \ token of the syntax. + */ +struct addrrange { + /** + * Link to nidrange::nr_addrranges. + */ + struct list_head ar_link; + /** + * List head for cfs_expr_list::el_list. + */ + struct list_head ar_numaddr_ranges; +}; + +/** + * Nf_parse_addrlist method for networks using numeric addresses. + * + * Examples of such networks are gm and elan. + * + * \retval 0 if \a str parsed to numeric address + * \retval errno otherwise + */ +static int +libcfs_num_parse(char *str, int len, struct list_head *list) +{ + struct cfs_expr_list *el; + int rc; + + rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); + if (rc == 0) + list_add_tail(&el->el_link, list); + + return rc; +} + +/** + * Parses \ token on the syntax. + * + * Allocates struct addrrange and links to \a nidrange via + * (nidrange::nr_addrranges) + * + * \retval 1 if \a src parses to '*' | \ | \ + * \retval 0 otherwise + */ +static int +parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange) +{ + struct addrrange *addrrange; + + if (src->ls_len == 1 && src->ls_str[0] == '*') { + nidrange->nr_all = 1; + return 1; + } + + LIBCFS_ALLOC(addrrange, sizeof(struct addrrange)); + if (addrrange == NULL) + return 0; + list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges); + INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges); + + return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str, + src->ls_len, + &addrrange->ar_numaddr_ranges); +} + +/** + * Finds or creates struct nidrange. + * + * Checks if \a src is a valid network name, looks for corresponding + * nidrange on the ist of nidranges (\a nidlist), creates new struct + * nidrange if it is not found. + * + * \retval pointer to struct nidrange matching network specified via \a src + * \retval NULL if \a src does not match any network + */ +static struct nidrange * +add_nidrange(const struct cfs_lstr *src, + struct list_head *nidlist) +{ + struct netstrfns *nf; + struct nidrange *nr; + int endlen; + unsigned netnum; + + if (src->ls_len >= LNET_NIDSTR_SIZE) + return NULL; + + nf = libcfs_namenum2netstrfns(src->ls_str); + if (nf == NULL) + return NULL; + endlen = src->ls_len - strlen(nf->nf_name); + if (endlen == 0) + /* network name only, e.g. "elan" or "tcp" */ + netnum = 0; + else { + /* e.g. "elan25" or "tcp23", refuse to parse if + * network name is not appended with decimal or + * hexadecimal number */ + if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name), + endlen, &netnum, 0, MAX_NUMERIC_VALUE)) + return NULL; + } + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns != nf) + continue; + if (nr->nr_netnum != netnum) + continue; + return nr; + } + + LIBCFS_ALLOC(nr, sizeof(struct nidrange)); + if (nr == NULL) + return NULL; + list_add_tail(&nr->nr_link, nidlist); + INIT_LIST_HEAD(&nr->nr_addrranges); + nr->nr_netstrfns = nf; + nr->nr_all = 0; + nr->nr_netnum = netnum; + + return nr; +} + +/** + * Parses \ token of the syntax. + * + * \retval 1 if \a src parses to \ '@' \ + * \retval 0 otherwise + */ +static int +parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist) +{ + struct cfs_lstr addrrange; + struct cfs_lstr net; + struct cfs_lstr tmp; + struct nidrange *nr; + + tmp = *src; + if (cfs_gettok(src, '@', &addrrange) == 0) + goto failed; + + if (cfs_gettok(src, '@', &net) == 0 || src->ls_str != NULL) + goto failed; + + nr = add_nidrange(&net, nidlist); + if (nr == NULL) + goto failed; + + if (parse_addrange(&addrrange, nr) != 0) + goto failed; + + return 1; + failed: + CWARN("can't parse nidrange: \"%.*s\"\n", tmp.ls_len, tmp.ls_str); + return 0; +} + +/** + * Frees addrrange structures of \a list. + * + * For each struct addrrange structure found on \a list it frees + * cfs_expr_list list attached to it and frees the addrrange itself. + * + * \retval none + */ +static void +free_addrranges(struct list_head *list) +{ + while (!list_empty(list)) { + struct addrrange *ar; + + ar = list_entry(list->next, struct addrrange, ar_link); + + cfs_expr_list_free_list(&ar->ar_numaddr_ranges); + list_del(&ar->ar_link); + LIBCFS_FREE(ar, sizeof(struct addrrange)); + } +} + +/** + * Frees nidrange strutures of \a list. + * + * For each struct nidrange structure found on \a list it frees + * addrrange list attached to it and frees the nidrange itself. + * + * \retval none + */ +void +cfs_free_nidlist(struct list_head *list) +{ + struct list_head *pos, *next; + struct nidrange *nr; + + list_for_each_safe(pos, next, list) { + nr = list_entry(pos, struct nidrange, nr_link); + free_addrranges(&nr->nr_addrranges); + list_del(pos); + LIBCFS_FREE(nr, sizeof(struct nidrange)); + } +} + +/** + * Parses nid range list. + * + * Parses with rigorous syntax and overflow checking \a str into + * \ [ ' ' \ ], compiles \a str into set of + * structures and links that structure to \a nidlist. The resulting + * list can be used to match a NID againts set of NIDS defined by \a + * str. + * \see cfs_match_nid + * + * \retval 1 on success + * \retval 0 otherwise + */ +int +cfs_parse_nidlist(char *str, int len, struct list_head *nidlist) +{ + struct cfs_lstr src; + struct cfs_lstr res; + int rc; + ENTRY; + + src.ls_str = str; + src.ls_len = len; + INIT_LIST_HEAD(nidlist); + while (src.ls_str) { + rc = cfs_gettok(&src, ' ', &res); + if (rc == 0) { + cfs_free_nidlist(nidlist); + RETURN(0); + } + rc = parse_nidrange(&res, nidlist); + if (rc == 0) { + cfs_free_nidlist(nidlist); + RETURN(0); + } + } + RETURN(1); +} + +/* + * Nf_match_addr method for networks using numeric addresses + * + * \retval 1 on match + * \retval 0 otherwise + */ +static int +libcfs_num_match(__u32 addr, struct list_head *numaddr) +{ + struct cfs_expr_list *el; + + LASSERT(!list_empty(numaddr)); + el = list_entry(numaddr->next, struct cfs_expr_list, el_link); + + return cfs_expr_list_match(addr, el); +} + +/** + * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist). + * + * \see cfs_parse_nidlist() + * + * \retval 1 on match + * \retval 0 otherwises + */ +int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist) +{ + struct nidrange *nr; + struct addrrange *ar; + ENTRY; + + list_for_each_entry(nr, nidlist, nr_link) { + if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid))) + continue; + if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid))) + continue; + if (nr->nr_all) + RETURN(1); + list_for_each_entry(ar, &nr->nr_addrranges, ar_link) + if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid), + &ar->ar_numaddr_ranges)) + RETURN(1); + } + RETURN(0); +} + + +EXPORT_SYMBOL(libcfs_isknown_lnd); +EXPORT_SYMBOL(libcfs_lnd2modname); +EXPORT_SYMBOL(libcfs_lnd2str); +EXPORT_SYMBOL(libcfs_str2lnd); +EXPORT_SYMBOL(libcfs_net2str); +EXPORT_SYMBOL(libcfs_nid2str); +EXPORT_SYMBOL(libcfs_str2net); +EXPORT_SYMBOL(libcfs_str2nid); +EXPORT_SYMBOL(libcfs_id2str); +EXPORT_SYMBOL(libcfs_str2anynid); +EXPORT_SYMBOL(cfs_free_nidlist); +EXPORT_SYMBOL(cfs_parse_nidlist); +EXPORT_SYMBOL(cfs_match_nid); diff --git a/drivers/staging/lustre/lustre/libcfs/prng.c b/drivers/staging/lustre/lustre/libcfs/prng.c new file mode 100644 index 000000000000..69224d84bc4b --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/prng.c @@ -0,0 +1,139 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/prng.c + * + * concatenation of following two 16-bit multiply with carry generators + * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16, + * number and carry packed within the same 32 bit integer. + * algorithm recommended by Marsaglia +*/ + +#include + +/* +From: George Marsaglia +Newsgroups: sci.math +Subject: Re: A RANDOM NUMBER GENERATOR FOR C +Date: Tue, 30 Sep 1997 05:29:35 -0700 + + * You may replace the two constants 36969 and 18000 by any + * pair of distinct constants from this list: + * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584 + * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243 + * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974 + * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114 + * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088 + * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834 + * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013 + * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083 + * (or any other 16-bit constants k for which both k*2^16-1 + * and k*2^15-1 are prime) */ + +#define RANDOM_CONST_A 18030 +#define RANDOM_CONST_B 29013 + +static unsigned int seed_x = 521288629; +static unsigned int seed_y = 362436069; + +/** + * cfs_rand - creates new seeds + * + * First it creates new seeds from the previous seeds. Then it generates a + * new psuedo random number for use. + * + * Returns a pseudo-random 32-bit integer + */ +unsigned int cfs_rand(void) +{ + seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16); + seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16); + + return ((seed_x << 16) + (seed_y & 65535)); +} +EXPORT_SYMBOL(cfs_rand); + +/** + * cfs_srand - sets the inital seed + * @seed1 : (seed_x) should have the most entropy in the low bits of the word + * @seed2 : (seed_y) should have the most entropy in the high bits of the word + * + * Replaces the original seeds with new values. Used to generate a new pseudo + * random numbers. + */ +void cfs_srand(unsigned int seed1, unsigned int seed2) +{ + if (seed1) + seed_x = seed1; /* use default seeds if parameter is 0 */ + if (seed2) + seed_y = seed2; +} +EXPORT_SYMBOL(cfs_srand); + +/** + * cfs_get_random_bytes - generate a bunch of random numbers + * @buf : buffer to fill with random numbers + * @size: size of passed in buffer + * + * Fills a buffer with random bytes + */ +void cfs_get_random_bytes(void *buf, int size) +{ + int *p = buf; + int rem, tmp; + + LASSERT(size >= 0); + + rem = min((int)((unsigned long)buf & (sizeof(int) - 1)), size); + if (rem) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, rem); + p = buf + rem; + size -= rem; + } + + while (size >= sizeof(int)) { + get_random_bytes(&tmp, sizeof(tmp)); + *p = cfs_rand() ^ tmp; + size -= sizeof(int); + p++; + } + buf = p; + if (size) { + get_random_bytes(&tmp, sizeof(tmp)); + tmp ^= cfs_rand(); + memcpy(buf, &tmp, size); + } +} +EXPORT_SYMBOL(cfs_get_random_bytes); diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.c b/drivers/staging/lustre/lustre/libcfs/tracefile.c new file mode 100644 index 000000000000..439e71dfae33 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/tracefile.c @@ -0,0 +1,1195 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/tracefile.c + * + * Author: Zach Brown + * Author: Phil Schwan + */ + + +#define DEBUG_SUBSYSTEM S_LNET +#define LUSTRE_TRACEFILE_PRIVATE +#include "tracefile.h" + +#include + +/* XXX move things up to the top, comment */ +union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; + +char cfs_tracefile[TRACEFILE_NAME_SIZE]; +long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; +static struct tracefiled_ctl trace_tctl; +struct mutex cfs_trace_thread_mutex; +static int thread_running = 0; + +atomic_t cfs_tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd); + +static inline struct cfs_trace_page * +cfs_tage_from_list(struct list_head *list) +{ + return list_entry(list, struct cfs_trace_page, linkage); +} + +static struct cfs_trace_page *cfs_tage_alloc(int gfp) +{ + struct page *page; + struct cfs_trace_page *tage; + + /* My caller is trying to free memory */ + if (!in_interrupt() && memory_pressure_get()) + return NULL; + + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. + */ + gfp |= __GFP_NOWARN; + page = alloc_page(gfp); + if (page == NULL) + return NULL; + + tage = kmalloc(sizeof(*tage), gfp); + if (tage == NULL) { + __free_page(page); + return NULL; + } + + tage->page = page; + atomic_inc(&cfs_tage_allocated); + return tage; +} + +static void cfs_tage_free(struct cfs_trace_page *tage) +{ + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); + + __free_page(tage->page); + kfree(tage); + atomic_dec(&cfs_tage_allocated); +} + +static void cfs_tage_to_tail(struct cfs_trace_page *tage, + struct list_head *queue) +{ + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); + + list_move_tail(&tage->linkage, queue); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp, + struct list_head *stock) +{ + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct cfs_trace_page *tage; + + tage = cfs_tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page * +cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) +{ + struct cfs_trace_page *tage; + + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); + tage = cfs_tage_from_list(tcd->tcd_pages.prev); + if (tage->used + len <= PAGE_CACHE_SIZE) + return tage; + } + + if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { + if (tcd->tcd_cur_stock_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); + --tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = cfs_tage_alloc(GFP_ATOMIC); + if (unlikely(tage == NULL)) { + if ((!memory_pressure_get() || + in_interrupt()) && printk_ratelimit()) + printk(KERN_WARNING + "cannot allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } + } + + tage->used = 0; + tage->cpu = smp_processor_id(); + tage->type = tcd->tcd_type; + list_add_tail(&tage->linkage, &tcd->tcd_pages); + tcd->tcd_cur_pages++; + + if (tcd->tcd_cur_pages > 8 && thread_running) { + struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. + */ + wake_up(&tctl->tctl_waitq); + } + return tage; + } + return NULL; +} + +static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (printk_ratelimit()) + printk(KERN_WARNING "debug daemon buffer overflowed; " + "discarding 10%% of pages (%d of %ld)\n", + pgcount + 1, tcd->tcd_cur_pages); + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; + + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; + } + put_pages_on_tcd_daemon_list(&pc, tcd); +} + +/* return a page that has 'len' bytes left at the end */ +static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, + unsigned long len) +{ + struct cfs_trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + if (len > PAGE_CACHE_SIZE) { + printk(KERN_ERR + "cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + + tage = cfs_trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + cfs_tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = cfs_tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + cfs_tage_to_tail(tage, &tcd->tcd_pages); + } + return tage; +} + +int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, + const char *format, ...) +{ + va_list args; + int rc; + + va_start(args, format); + rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); + va_end(args); + + return rc; +} +EXPORT_SYMBOL(libcfs_debug_msg); + +int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, + const char *format1, va_list args, + const char *format2, ...) +{ + struct cfs_trace_cpu_data *tcd = NULL; + struct ptldebug_header header = {0}; + struct cfs_trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int depth; + int i; + int remain; + int mask = msgdata->msg_mask; + char *file = (char *)msgdata->msg_file; + cfs_debug_limit_state_t *cdls = msgdata->msg_cdls; + + if (strchr(file, '/')) + file = strrchr(file, '/') + 1; + + tcd = cfs_trace_get_tcd(); + + /* cfs_trace_get_tcd() grabs a lock, which disables preemption and + * pins us to a particular CPU. This avoids an smp_processor_id() + * warning on Linux when debugging is enabled. */ + cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); + + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; + + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + + if (tcd->tcd_shutting_down) { + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + depth = __current_nesting_level(); + known_size = strlen(file) + 1 + depth; + if (msgdata->msg_fn) + known_size += strlen(msgdata->msg_fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /*/ + * '2' used because vsnprintf return real size required for output + * _without_ terminating NULL. + * if needed is to small for this format. + */ + for (i = 0; i < 2; i++) { + tage = cfs_trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > PAGE_CACHE_SIZE) + mask |= D_ERROR; + + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + string_buf = (char *)page_address(tage->page) + + tage->used + known_size; + + max_nob = PAGE_CACHE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %d\n", + max_nob); + mask |= D_ERROR; + cfs_trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf + needed, remain, + format2, ap); + va_end(ap); + } + + if (needed < max_nob) /* well. printing ok.. */ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in " + "newline\n", file, msgdata->msg_line, msgdata->msg_fn); + + header.ph_len = known_size + needed; + debug_buf = (char *)page_address(tage->page) + tage->used; + + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + /* indent message according to the nesting level */ + while (depth-- > 0) { + *(debug_buf++) = '.'; + ++ tage->used; + } + + strcpy(debug_buf, file); + tage->used += strlen(file) + 1; + debug_buf += strlen(file) + 1; + + if (msgdata->msg_fn) { + strcpy(debug_buf, msgdata->msg_fn); + tage->used += strlen(msgdata->msg_fn) + 1; + debug_buf += strlen(msgdata->msg_fn) + 1; + } + + __LASSERT(debug_buf == string_buf); + + tage->used += needed; + __LASSERT (tage->used <= PAGE_CACHE_SIZE); + +console: + if ((mask & libcfs_printk) == 0) { + /* no console output requested */ + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cdls != NULL) { + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + cfs_trace_put_tcd(tcd); + return 1; + } + + if (cfs_time_after(cfs_time_current(), cdls->cdls_next + + libcfs_console_max_delay + + cfs_time_seconds(10))) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= libcfs_console_backoff * 4; + } else { + cdls->cdls_delay *= libcfs_console_backoff; + + if (cdls->cdls_delay < libcfs_console_min_delay) + cdls->cdls_delay = libcfs_console_min_delay; + else if (cdls->cdls_delay > libcfs_console_max_delay) + cdls->cdls_delay = libcfs_console_max_delay; + } + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + cfs_print_to_console(&header, mask, string_buf, needed, file, + msgdata->msg_fn); + cfs_trace_put_tcd(tcd); + } else { + string_buf = cfs_trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, + CFS_TRACE_CONSOLE_BUFFER_SIZE, + format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, + format2, ap); + va_end(ap); + } + } + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = cfs_trace_get_console_buffer(); + + needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, + (cdls->cdls_count > 1) ? "s" : ""); + + cfs_print_to_console(&header, mask, + string_buf, needed, file, msgdata->msg_fn); + + cfs_trace_put_console_buffer(string_buf); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *msgdata) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + mb(); + + cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); + + cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), + msgdata->msg_file, msgdata->msg_fn); + + panic("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + + INIT_LIST_HEAD(&pc->pc_pages); + + cfs_tcd_for_each(tcd, i, j) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } +} + +static void collect_pages_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + spin_lock(&pc->pc_lock); + cfs_for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + list_splice_init(&tcd->tcd_pages, &pc->pc_pages); + tcd->tcd_cur_pages = 0; + if (pc->pc_want_daemon_pages) { + list_splice_init(&tcd->tcd_daemon_pages, + &pc->pc_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void collect_pages(struct page_collection *pc) +{ + INIT_LIST_HEAD(&pc->pc_pages); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + collect_pages_on_all_cpus(pc); +} + +static void put_pages_back_on_all_cpus(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + struct list_head *cur_head; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + spin_lock(&pc->pc_lock); + cfs_for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + cur_head = tcd->tcd_pages.next; + + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, + linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != cpu || tage->type != i) + continue; + + cfs_tage_to_tail(tage, cur_head); + tcd->tcd_cur_pages++; + } + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_back(struct page_collection *pc) +{ + if (!libcfs_panic_in_progress) + put_pages_back_on_all_cpus(pc); +} + +/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that + * we have a good amount of data at all times for dumping during an LBUG, even + * if we have been steadily writing (and otherwise discarding) pages via the + * debug daemon. */ +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct cfs_trace_cpu_data *tcd) +{ + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock(&pc->pc_lock); + list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) + continue; + + cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages++; + + if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { + struct cfs_trace_page *victim; + + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); + + __LASSERT_TAGE_INVARIANT(victim); + + list_del(&victim->linkage); + cfs_tage_free(victim); + tcd->tcd_cur_daemon_pages--; + } + } + spin_unlock(&pc->pc_lock); +} + +static void put_pages_on_daemon_list(struct page_collection *pc) +{ + struct cfs_trace_cpu_data *tcd; + int i, cpu; + + cfs_for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) + put_pages_on_tcd_daemon_list(pc, tcd); + } +} + +void cfs_trace_debug_print(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + char *p, *file, *fn; + struct page *page; + + __LASSERT_TAGE_INVARIANT(tage); + + page = tage->page; + p = page_address(page); + while (p < ((char *)page_address(page) + tage->used)) { + struct ptldebug_header *hdr; + int len; + hdr = (void *)p; + p += sizeof(*hdr); + file = p; + p += strlen(file) + 1; + fn = p; + p += strlen(fn) + 1; + len = hdr->ph_len - (int)(p - (char *)hdr); + + cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; + } + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_tracefile_dump_all_pages(char *filename) +{ + struct page_collection pc; + struct file *filp; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int rc; + + DECL_MMSPACE; + + cfs_tracefile_write_lock(); + + filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", + filename, rc); + goto out; + } + + spin_lock_init(&pc.pc_lock); + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) { + rc = 0; + goto close; + } + + /* ok, for now, just write the pages. in the future we'll be building + * iobufs with the pages and calling generic_direct_IO */ + MMSPACE_OPEN; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + rc = filp_write(filp, page_address(tage->page), + tage->used, filp_poff(filp)); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u but wrote " + "%d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + break; + } + list_del(&tage->linkage); + cfs_tage_free(tage); + } + MMSPACE_CLOSE; + rc = filp_fsync(filp); + if (rc) + printk(KERN_ERR "sync returns %d\n", rc); +close: + filp_close(filp, NULL); +out: + cfs_tracefile_write_unlock(); + return rc; +} + +void cfs_trace_flush_pages(void) +{ + struct page_collection pc; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + + spin_lock_init(&pc.pc_lock); + + pc.pc_want_daemon_pages = 1; + collect_pages(&pc); + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { + + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } +} + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char *usr_buffer, int usr_buffer_nob) +{ + int nob; + + if (usr_buffer_nob > knl_buffer_nob) + return -EOVERFLOW; + + if (copy_from_user((void *)knl_buffer, + (void *)usr_buffer, usr_buffer_nob)) + return -EFAULT; + + nob = strnlen(knl_buffer, usr_buffer_nob); + while (nob-- >= 0) /* strip trailing whitespace */ + if (!isspace(knl_buffer[nob])) + break; + + if (nob < 0) /* empty string */ + return -EINVAL; + + if (nob == knl_buffer_nob) /* no space to terminate */ + return -EOVERFLOW; + + knl_buffer[nob + 1] = 0; /* terminate */ + return 0; +} +EXPORT_SYMBOL(cfs_trace_copyin_string); + +int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob, + const char *knl_buffer, char *append) +{ + /* NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and "" (i.e. a + * terminating zero byte) for sysctl entries */ + int nob = strlen(knl_buffer); + + if (nob > usr_buffer_nob) + nob = usr_buffer_nob; + + if (copy_to_user(usr_buffer, knl_buffer, nob)) + return -EFAULT; + + if (append != NULL && nob < usr_buffer_nob) { + if (copy_to_user(usr_buffer + nob, append, 1)) + return -EFAULT; + + nob++; + } + + return nob; +} +EXPORT_SYMBOL(cfs_trace_copyout_string); + +int cfs_trace_allocate_string_buffer(char **str, int nob) +{ + if (nob > 2 * PAGE_CACHE_SIZE) /* string must be "sensible" */ + return -EINVAL; + + *str = kmalloc(nob, GFP_IOFS | __GFP_ZERO); + if (*str == NULL) + return -ENOMEM; + + return 0; +} + +void cfs_trace_free_string_buffer(char *str, int nob) +{ + kfree(str); +} + +int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc != 0) + goto out; + + if (str[0] != '/') { + rc = -EINVAL; + goto out; + } + rc = cfs_tracefile_dump_all_pages(str); +out: + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_daemon_command(char *str) +{ + int rc = 0; + + cfs_tracefile_write_lock(); + + if (strcmp(str, "stop") == 0) { + cfs_tracefile_write_unlock(); + cfs_trace_stop_thread(); + cfs_tracefile_write_lock(); + memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); + + } else if (strncmp(str, "size=", 5) == 0) { + cfs_tracefile_size = simple_strtoul(str + 5, NULL, 0); + if (cfs_tracefile_size < 10 || cfs_tracefile_size > 20480) + cfs_tracefile_size = CFS_TRACEFILE_SIZE; + else + cfs_tracefile_size <<= 20; + + } else if (strlen(str) >= sizeof(cfs_tracefile)) { + rc = -ENAMETOOLONG; + } else if (str[0] != '/') { + rc = -EINVAL; + } else { + strcpy(cfs_tracefile, str); + + printk(KERN_INFO + "Lustre: debug daemon will attempt to start writing " + "to %s (%lukB max)\n", cfs_tracefile, + (long)(cfs_tracefile_size >> 10)); + + cfs_trace_start_thread(); + } + + cfs_tracefile_write_unlock(); + return rc; +} + +int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob) +{ + char *str; + int rc; + + rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); + if (rc != 0) + return rc; + + rc = cfs_trace_copyin_string(str, usr_str_nob + 1, + usr_str, usr_str_nob); + if (rc == 0) + rc = cfs_trace_daemon_command(str); + + cfs_trace_free_string_buffer(str, usr_str_nob + 1); + return rc; +} + +int cfs_trace_set_debug_mb(int mb) +{ + int i; + int j; + int pages; + int limit = cfs_trace_max_debug_mb(); + struct cfs_trace_cpu_data *tcd; + + if (mb < num_possible_cpus()) { + printk(KERN_WARNING + "Lustre: %d MB is too small for debug buffer size, " + "setting it to %d MB.\n", mb, num_possible_cpus()); + mb = num_possible_cpus(); + } + + if (mb > limit) { + printk(KERN_WARNING + "Lustre: %d MB is too large for debug buffer size, " + "setting it to %d MB.\n", mb, limit); + mb = limit; + } + + mb /= num_possible_cpus(); + pages = mb << (20 - PAGE_CACHE_SHIFT); + + cfs_tracefile_write_lock(); + + cfs_tcd_for_each(tcd, i, j) + tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; + + cfs_tracefile_write_unlock(); + + return 0; +} + +int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob) +{ + char str[32]; + int rc; + + rc = cfs_trace_copyin_string(str, sizeof(str), usr_str, usr_str_nob); + if (rc < 0) + return rc; + + return cfs_trace_set_debug_mb(simple_strtoul(str, NULL, 0)); +} + +int cfs_trace_get_debug_mb(void) +{ + int i; + int j; + struct cfs_trace_cpu_data *tcd; + int total_pages = 0; + + cfs_tracefile_read_lock(); + + cfs_tcd_for_each(tcd, i, j) + total_pages += tcd->tcd_max_pages; + + cfs_tracefile_read_unlock(); + + return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1; +} + +static int tracefiled(void *arg) +{ + struct page_collection pc; + struct tracefiled_ctl *tctl = arg; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + struct file *filp; + int last_loop = 0; + int rc; + + DECL_MMSPACE; + + /* we're started late enough that we pick up init's fs context */ + /* this is so broken in uml? what on earth is going on? */ + + spin_lock_init(&pc.pc_lock); + complete(&tctl->tctl_start); + + while (1) { + wait_queue_t __wait; + + pc.pc_want_daemon_pages = 0; + collect_pages(&pc); + if (list_empty(&pc.pc_pages)) + goto end_loop; + + filp = NULL; + cfs_tracefile_read_lock(); + if (cfs_tracefile[0] != 0) { + filp = filp_open(cfs_tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + filp = NULL; + printk(KERN_WARNING "couldn't open %s: " + "%d\n", cfs_tracefile, rc); + } + } + cfs_tracefile_read_unlock(); + if (filp == NULL) { + put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + goto end_loop; + } + + MMSPACE_OPEN; + + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) { + static loff_t f_pos; + + __LASSERT_TAGE_INVARIANT(tage); + + if (f_pos >= (off_t)cfs_tracefile_size) + f_pos = 0; + else if (f_pos > (off_t)filp_size(filp)) + f_pos = filp_size(filp); + + rc = filp_write(filp, page_address(tage->page), + tage->used, &f_pos); + if (rc != (int)tage->used) { + printk(KERN_WARNING "wanted to write %u " + "but wrote %d\n", tage->used, rc); + put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); + } + } + MMSPACE_CLOSE; + + filp_close(filp, NULL); + put_pages_on_daemon_list(&pc); + if (!list_empty(&pc.pc_pages)) { + int i; + + printk(KERN_ALERT "Lustre: trace pages aren't " + " empty\n"); + printk(KERN_ERR "total cpus(%d): ", + num_possible_cpus()); + for (i = 0; i < num_possible_cpus(); i++) + if (cpu_online(i)) + printk(KERN_ERR "%d(on) ", i); + else + printk(KERN_ERR "%d(off) ", i); + printk(KERN_ERR "\n"); + + i = 0; + list_for_each_entry_safe(tage, tmp, &pc.pc_pages, + linkage) + printk(KERN_ERR "page %d belongs to cpu " + "%d\n", ++i, tage->cpu); + printk(KERN_ERR "There are %d pages unwritten\n", + i); + } + __LASSERT(list_empty(&pc.pc_pages)); +end_loop: + if (atomic_read(&tctl->tctl_shutdown)) { + if (last_loop == 0) { + last_loop = 1; + continue; + } else { + break; + } + } + init_waitqueue_entry_current(&__wait); + add_wait_queue(&tctl->tctl_waitq, &__wait); + set_current_state(TASK_INTERRUPTIBLE); + waitq_timedwait(&__wait, TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + remove_wait_queue(&tctl->tctl_waitq, &__wait); + } + complete(&tctl->tctl_stop); + return 0; +} + +int cfs_trace_start_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + int rc = 0; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) + goto out; + + init_completion(&tctl->tctl_start); + init_completion(&tctl->tctl_stop); + init_waitqueue_head(&tctl->tctl_waitq); + atomic_set(&tctl->tctl_shutdown, 0); + + if (IS_ERR(kthread_run(tracefiled, tctl, "ktracefiled"))) { + rc = -ECHILD; + goto out; + } + + wait_for_completion(&tctl->tctl_start); + thread_running = 1; +out: + mutex_unlock(&cfs_trace_thread_mutex); + return rc; +} + +void cfs_trace_stop_thread(void) +{ + struct tracefiled_ctl *tctl = &trace_tctl; + + mutex_lock(&cfs_trace_thread_mutex); + if (thread_running) { + printk(KERN_INFO + "Lustre: shutting down debug daemon thread...\n"); + atomic_set(&tctl->tctl_shutdown, 1); + wait_for_completion(&tctl->tctl_stop); + thread_running = 0; + } + mutex_unlock(&cfs_trace_thread_mutex); +} + +int cfs_tracefile_init(int max_pages) +{ + struct cfs_trace_cpu_data *tcd; + int i; + int j; + int rc; + int factor; + + rc = cfs_tracefile_init_arch(); + if (rc != 0) + return rc; + + cfs_tcd_for_each(tcd, i, j) { + /* tcd_pages_factor is initialized int tracefile_init_arch. */ + factor = tcd->tcd_pages_factor; + INIT_LIST_HEAD(&tcd->tcd_pages); + INIT_LIST_HEAD(&tcd->tcd_stock_pages); + INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; + tcd->tcd_cur_daemon_pages = 0; + tcd->tcd_max_pages = (max_pages * factor) / 100; + LASSERT(tcd->tcd_max_pages > 0); + tcd->tcd_shutting_down = 0; + } + + return 0; +} + +static void trace_cleanup_on_all_cpus(void) +{ + struct cfs_trace_cpu_data *tcd; + struct cfs_trace_page *tage; + struct cfs_trace_page *tmp; + int i, cpu; + + cfs_for_each_possible_cpu(cpu) { + cfs_tcd_for_each_type_lock(tcd, i, cpu) { + tcd->tcd_shutting_down = 1; + + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, + linkage) { + __LASSERT_TAGE_INVARIANT(tage); + + list_del(&tage->linkage); + cfs_tage_free(tage); + } + + tcd->tcd_cur_pages = 0; + } + } +} + +static void cfs_trace_cleanup(void) +{ + struct page_collection pc; + + INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); + + trace_cleanup_on_all_cpus(); + + cfs_tracefile_fini_arch(); +} + +void cfs_tracefile_exit(void) +{ + cfs_trace_stop_thread(); + cfs_trace_cleanup(); +} diff --git a/drivers/staging/lustre/lustre/libcfs/tracefile.h b/drivers/staging/lustre/lustre/libcfs/tracefile.h new file mode 100644 index 000000000000..7e8d17c12b5b --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/tracefile.h @@ -0,0 +1,340 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LIBCFS_TRACEFILE_H__ +#define __LIBCFS_TRACEFILE_H__ + +#include + +#include "linux/linux-tracefile.h" + +/* trace file lock routines */ + +#define TRACEFILE_NAME_SIZE 1024 +extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; +extern long long cfs_tracefile_size; + +extern void libcfs_run_debug_log_upcall(char *file); + +int cfs_tracefile_init_arch(void); +void cfs_tracefile_fini_arch(void); + +void cfs_tracefile_read_lock(void); +void cfs_tracefile_read_unlock(void); +void cfs_tracefile_write_lock(void); +void cfs_tracefile_write_unlock(void); + +int cfs_tracefile_dump_all_pages(char *filename); +void cfs_trace_debug_print(void); +void cfs_trace_flush_pages(void); +int cfs_trace_start_thread(void); +void cfs_trace_stop_thread(void); +int cfs_tracefile_init(int max_pages); +void cfs_tracefile_exit(void); + + + +int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, + const char *usr_buffer, int usr_buffer_nob); +int cfs_trace_copyout_string(char *usr_buffer, int usr_buffer_nob, + const char *knl_str, char *append); +int cfs_trace_allocate_string_buffer(char **str, int nob); +void cfs_trace_free_string_buffer(char *str, int nob); +int cfs_trace_dump_debug_buffer_usrstr(void *usr_str, int usr_str_nob); +int cfs_trace_daemon_command(char *str); +int cfs_trace_daemon_command_usrstr(void *usr_str, int usr_str_nob); +int cfs_trace_set_debug_mb(int mb); +int cfs_trace_set_debug_mb_usrstr(void *usr_str, int usr_str_nob); +int cfs_trace_get_debug_mb(void); + +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; +extern int cfs_trace_max_debug_mb(void); + +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) +#define CFS_TRACEFILE_SIZE (500 << 20) + +#ifdef LUSTRE_TRACEFILE_PRIVATE + +/* + * Private declare for tracefile + */ +#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) + +#define CFS_TRACEFILE_SIZE (500 << 20) + +/* Size of a buffer for sprinting console messages if we can't get a page + * from system */ +#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 + +union cfs_trace_data_union { + struct cfs_trace_cpu_data { + /* + * Even though this structure is meant to be per-CPU, locking + * is needed because in some places the data may be accessed + * from other CPUs. This lock is directly used in trace_get_tcd + * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and + * tcd_for_each_type_lock + */ + spinlock_t tcd_lock; + unsigned long tcd_lock_flags; + + /* + * pages with trace records not yet processed by tracefiled. + */ + struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ + unsigned long tcd_cur_pages; + + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ + struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ + unsigned long tcd_cur_daemon_pages; + + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. + * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current + * implementation. + */ + unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. + * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + + unsigned short tcd_shutting_down; + unsigned short tcd_cpu; + unsigned short tcd_type; + /* The factors to share debug memory. */ + unsigned short tcd_pages_factor; + } tcd; + char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; +}; + +#define TCD_MAX_TYPES 8 +extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; + +#define cfs_tcd_for_each(tcd, i, j) \ + for (i = 0; cfs_trace_data[i] != NULL; i++) \ + for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ + j < num_possible_cpus(); \ + j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) + +#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ + for (i = 0; cfs_trace_data[i] && \ + (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ + cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct page_collection { + struct list_head pc_pages; + /* + * spin-lock protecting ->pc_pages. It is taken by smp_call_function() + * call-back functions. XXX nikita: Which is horrible: all processors + * receive NMI at the same time only to be serialized by this + * lock. Probably ->pc_pages should be replaced with an array of + * NR_CPUS elements accessed locklessly. + */ + spinlock_t pc_lock; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ + int pc_want_daemon_pages; +}; + +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct tracefiled_ctl { + struct completion tctl_start; + struct completion tctl_stop; + wait_queue_head_t tctl_waitq; + pid_t tctl_pid; + atomic_t tctl_shutdown; +}; + +/* + * small data-structure for each page owned by tracefiled. + */ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ +struct cfs_trace_page { + /* + * page itself + */ + struct page *page; + /* + * linkage into one of the lists in trace_data_union or + * page_collection + */ + struct list_head linkage; + /* + * number of bytes used within this page + */ + unsigned int used; + /* + * cpu that owns this page + */ + unsigned short cpu; + /* + * type(context) of this page + */ + unsigned short type; +}; + +extern void cfs_set_ptldebug_header(struct ptldebug_header *header, + struct libcfs_debug_msg_data *m, + unsigned long stack); +extern void cfs_print_to_console(struct ptldebug_header *hdr, int mask, + const char *buf, int len, const char *file, + const char *fn); + +extern int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); +extern void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); + +/** + * trace_buf_type_t, trace_buf_idx_get() and trace_console_buffers[][] + * are not public libcfs API; they should be defined in + * platform-specific tracefile include files + * (see, for example, linux-tracefile.h). + */ + +extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; +extern cfs_trace_buf_type_t cfs_trace_buf_idx_get(void); + +static inline char * +cfs_trace_get_console_buffer(void) +{ + unsigned int i = get_cpu(); + unsigned int j = cfs_trace_buf_idx_get(); + + return cfs_trace_console_buffers[i][j]; +} + +static inline void +cfs_trace_put_console_buffer(char *buffer) +{ + put_cpu(); +} + +static inline struct cfs_trace_cpu_data * +cfs_trace_get_tcd(void) +{ + struct cfs_trace_cpu_data *tcd = + &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; + + cfs_trace_lock_tcd(tcd, 0); + + return tcd; +} + +static inline void +cfs_trace_put_tcd (struct cfs_trace_cpu_data *tcd) +{ + cfs_trace_unlock_tcd(tcd, 0); + + put_cpu(); +} + +int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, int gfp, + struct list_head *stock); + + +int cfs_tcd_owns_tage(struct cfs_trace_cpu_data *tcd, + struct cfs_trace_page *tage); + +extern void cfs_trace_assertion_failed(const char *str, + struct libcfs_debug_msg_data *m); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +do { \ + if (unlikely(!(cond))) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ + cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ + &msgdata); \ + } \ +} while (0) + +#define __LASSERT_TAGE_INVARIANT(tage) \ +do { \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= PAGE_CACHE_SIZE); \ + __LASSERT(page_count(tage->page) > 0); \ +} while (0) + +#endif /* LUSTRE_TRACEFILE_PRIVATE */ + +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustre/lustre/libcfs/upcall_cache.c b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c new file mode 100644 index 000000000000..18c68c3493b8 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/upcall_cache.c @@ -0,0 +1,462 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/upcall_cache.c + * + * Supplementary groups cache. + */ +#define DEBUG_SUBSYSTEM S_SEC + +#include + +static struct upcall_cache_entry *alloc_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry; + + LIBCFS_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + if (cache->uc_ops->init_entry) + cache->uc_ops->init_entry(entry, args); + return entry; +} + +/* protected by cache lock */ +static void free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (cache->uc_ops->free_entry) + cache->uc_ops->free_entry(cache, entry); + + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n", + entry, entry->ue_key); + LIBCFS_FREE(entry, sizeof(*entry)); +} + +static inline int upcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->upcall_compare) + return cache->uc_ops->upcall_compare(cache, entry, key, args); + + return 0; +} + +static inline int downcall_compare(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key, void *args) +{ + if (entry->ue_key != key) + return -1; + + if (cache->uc_ops->downcall_compare) + return cache->uc_ops->downcall_compare(cache, entry, key, args); + + return 0; +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(cache, entry); + } +} + +static int check_unlink_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + if (UC_CACHE_IS_VALID(entry) && + cfs_time_before(cfs_time_current(), entry->ue_expire)) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (entry->ue_acquire_expire == 0 || + cfs_time_before(cfs_time_current(), + entry->ue_acquire_expire)) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up_all(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + return 1; +} + +static inline int refresh_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + LASSERT(cache->uc_ops->do_upcall); + return cache->uc_ops->do_upcall(cache, entry); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *cache, + __u64 key, void *args) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + struct list_head *head; + wait_queue_t wait; + int rc, found; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&cache->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(cache, entry)) + continue; + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (!found) { + if (!new) { + spin_unlock(&cache->uc_lock); + new = alloc_entry(cache, key, args); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(cache, new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + spin_unlock(&cache->uc_lock); + rc = refresh_entry(cache, entry); + spin_lock(&cache->uc_lock); + entry->ue_acquire_expire = + cfs_time_shift(cache->uc_acquire_expire); + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + wake_up_all(&entry->ue_waitq); + if (unlikely(rc == -EREMCHG)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + } + /* someone (and only one) is doing upcall upon this item, + * wait it to complete */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + long expiry = (entry == new) ? + cfs_time_seconds(cache->uc_acquire_expire) : + MAX_SCHEDULE_TIMEOUT; + long left; + + init_waitqueue_entry_current(&wait); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&cache->uc_lock); + + left = waitq_timedwait(&wait, TASK_INTERRUPTIBLE, + expiry); + + spin_lock(&cache->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed in the middle */ + rc = left > 0 ? -EINTR : -ETIMEDOUT; + CERROR("acquire for key "LPU64": error %d\n", + entry->ue_key, rc); + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(rc)); + } + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(cache, entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(cache, entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. + */ + if (entry != new) { + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&cache->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&cache->uc_lock); + put_entry(cache, entry); + spin_unlock(&cache->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u32 err, __u64 key, + void *args) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(cache); + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (downcall_compare(cache, entry, key, args) == 0) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n", + cache->uc_name, key); + /* haven't found, it's possible */ + spin_unlock(&cache->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n", + cache->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CDEBUG(D_RPCTRACE,"%s: found uptodate entry %p (key "LPU64")\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n", + cache->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&cache->uc_lock); + if (cache->uc_ops->parse_downcall) + rc = cache->uc_ops->parse_downcall(cache, entry, args); + spin_lock(&cache->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = cfs_time_shift(cache->uc_entry_expire); + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n", + cache->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&cache->uc_lock); + wake_up_all(&entry->ue_waitq); + put_entry(cache, entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +static void cache_flush(struct upcall_cache *cache, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&cache->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(cache, entry); + } + } + spin_unlock(&cache->uc_lock); + EXIT; +} + +void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + cache_flush(cache, 0); +} +EXPORT_SYMBOL(upcall_cache_flush_idle); + +void upcall_cache_flush_all(struct upcall_cache *cache) +{ + cache_flush(cache, 1); +} +EXPORT_SYMBOL(upcall_cache_flush_all); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0; + ENTRY; + + head = &cache->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&cache->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (upcall_compare(cache, entry, key, args) == 0) { + found = 1; + break; + } + } + + if (found) { + CWARN("%s: flush entry %p: key "LPU64", ref %d, fl %x, " + "cur %lu, ex %ld/%ld\n", + cache->uc_name, entry, entry->ue_key, + atomic_read(&entry->ue_refcount), entry->ue_flags, + cfs_time_current_sec(), entry->ue_acquire_expire, + entry->ue_expire); + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(cache, entry); + } + spin_unlock(&cache->uc_lock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +struct upcall_cache *upcall_cache_init(const char *name, const char *upcall, + struct upcall_cache_ops *ops) +{ + struct upcall_cache *cache; + int i; + ENTRY; + + LIBCFS_ALLOC(cache, sizeof(*cache)); + if (!cache) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&cache->uc_lock); + rwlock_init(&cache->uc_upcall_rwlock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + strncpy(cache->uc_name, name, sizeof(cache->uc_name) - 1); + /* upcall pathname proc tunable */ + strncpy(cache->uc_upcall, upcall, sizeof(cache->uc_upcall) - 1); + cache->uc_entry_expire = 20 * 60; + cache->uc_acquire_expire = 30; + cache->uc_ops = ops; + + RETURN(cache); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *cache) +{ + if (!cache) + return; + upcall_cache_flush_all(cache); + LIBCFS_FREE(cache, sizeof(*cache)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/drivers/staging/lustre/lustre/libcfs/watchdog.c b/drivers/staging/lustre/lustre/libcfs/watchdog.c new file mode 100644 index 000000000000..7c385ada3e10 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/watchdog.c @@ -0,0 +1,516 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/watchdog.c + * + * Author: Jacob Berkman + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include "tracefile.h" + +struct lc_watchdog { + spinlock_t lcw_lock; /* check or change lcw_list */ + int lcw_refcount; /* must hold lcw_pending_timers_lock */ + timer_list_t lcw_timer; /* kernel timer */ + struct list_head lcw_list; /* chain on pending list */ + cfs_time_t lcw_last_touched; /* last touched stamp */ + task_t *lcw_task; /* owner task */ + void (*lcw_callback)(pid_t, void *); + void *lcw_data; + + pid_t lcw_pid; + + enum { + LC_WATCHDOG_DISABLED, + LC_WATCHDOG_ENABLED, + LC_WATCHDOG_EXPIRED + } lcw_state; +}; + +#ifdef WITH_WATCHDOG +/* + * The dispatcher will complete lcw_start_completion when it starts, + * and lcw_stop_completion when it exits. + * Wake lcw_event_waitq to signal timer callback dispatches. + */ +static struct completion lcw_start_completion; +static struct completion lcw_stop_completion; +static wait_queue_head_t lcw_event_waitq; + +/* + * Set this and wake lcw_event_waitq to stop the dispatcher. + */ +enum { + LCW_FLAG_STOP = 0 +}; +static unsigned long lcw_flags = 0; + +/* + * Number of outstanding watchdogs. + * When it hits 1, we start the dispatcher. + * When it hits 0, we stop the dispatcher. + */ +static __u32 lcw_refcount = 0; +static DEFINE_MUTEX(lcw_refcount_mutex); + +/* + * List of timers that have fired that need their callbacks run by the + * dispatcher. + */ +/* BH lock! */ +static DEFINE_SPINLOCK(lcw_pending_timers_lock); +static struct list_head lcw_pending_timers = LIST_HEAD_INIT(lcw_pending_timers); + +/* Last time a watchdog expired */ +static cfs_time_t lcw_last_watchdog_time; +static int lcw_recent_watchdog_count; + +static void +lcw_dump(struct lc_watchdog *lcw) +{ + ENTRY; + rcu_read_lock(); + if (lcw->lcw_task == NULL) { + LCONSOLE_WARN("Process " LPPID " was not found in the task " + "list; watchdog callback may be incomplete\n", + (int)lcw->lcw_pid); + } else { + libcfs_debug_dumpstack(lcw->lcw_task); + } + + rcu_read_unlock(); + EXIT; +} + +static void lcw_cb(ulong_ptr_t data) +{ + struct lc_watchdog *lcw = (struct lc_watchdog *)data; + ENTRY; + + if (lcw->lcw_state != LC_WATCHDOG_ENABLED) { + EXIT; + return; + } + + lcw->lcw_state = LC_WATCHDOG_EXPIRED; + + spin_lock_bh(&lcw->lcw_lock); + LASSERT(list_empty(&lcw->lcw_list)); + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount++; /* +1 for pending list */ + list_add(&lcw->lcw_list, &lcw_pending_timers); + wake_up(&lcw_event_waitq); + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + EXIT; +} + +static int is_watchdog_fired(void) +{ + int rc; + + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) + return 1; + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + return rc; +} + +static void lcw_dump_stack(struct lc_watchdog *lcw) +{ + cfs_time_t current_time; + cfs_duration_t delta_time; + struct timeval timediff; + + current_time = cfs_time_current(); + delta_time = cfs_time_sub(current_time, lcw->lcw_last_touched); + cfs_duration_usec(delta_time, &timediff); + + /* + * Check to see if we should throttle the watchdog timer to avoid + * too many dumps going to the console thus triggering an NMI. + */ + delta_time = cfs_duration_sec(cfs_time_sub(current_time, + lcw_last_watchdog_time)); + + if (delta_time < libcfs_watchdog_ratelimit && + lcw_recent_watchdog_count > 3) { + LCONSOLE_WARN("Service thread pid %u was inactive for " + "%lu.%.02lus. Watchdog stack traces are limited " + "to 3 per %d seconds, skipping this one.\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_usec / 10000, + libcfs_watchdog_ratelimit); + } else { + if (delta_time < libcfs_watchdog_ratelimit) { + lcw_recent_watchdog_count++; + } else { + memcpy(&lcw_last_watchdog_time, ¤t_time, + sizeof(current_time)); + lcw_recent_watchdog_count = 0; + } + + LCONSOLE_WARN("Service thread pid %u was inactive for " + "%lu.%.02lus. The thread might be hung, or it " + "might only be slow and will resume later. " + "Dumping the stack trace for debugging purposes:" + "\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_usec / 10000); + lcw_dump(lcw); + } +} + +static int lcw_dispatch_main(void *data) +{ + int rc = 0; + struct lc_watchdog *lcw; + LIST_HEAD (zombies); + + ENTRY; + + complete(&lcw_start_completion); + + while (1) { + int dumplog = 1; + + cfs_wait_event_interruptible(lcw_event_waitq, + is_watchdog_fired(), rc); + CDEBUG(D_INFO, "Watchdog got woken up...\n"); + if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { + CDEBUG(D_INFO, "LCW_FLAG_STOP set, shutting down...\n"); + + spin_lock_bh(&lcw_pending_timers_lock); + rc = !list_empty(&lcw_pending_timers); + spin_unlock_bh(&lcw_pending_timers_lock); + if (rc) { + CERROR("pending timers list was not empty at " + "time of watchdog dispatch shutdown\n"); + } + break; + } + + spin_lock_bh(&lcw_pending_timers_lock); + while (!list_empty(&lcw_pending_timers)) { + int is_dumplog; + + lcw = list_entry(lcw_pending_timers.next, + struct lc_watchdog, lcw_list); + /* +1 ref for callback to make sure lwc wouldn't be + * deleted after releasing lcw_pending_timers_lock */ + lcw->lcw_refcount++; + spin_unlock_bh(&lcw_pending_timers_lock); + + /* lock ordering */ + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + + if (list_empty(&lcw->lcw_list)) { + /* already removed from pending list */ + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + spin_unlock_bh(&lcw->lcw_lock); + /* still hold lcw_pending_timers_lock */ + continue; + } + + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + CDEBUG(D_INFO, "found lcw for pid " LPPID "\n", + lcw->lcw_pid); + lcw_dump_stack(lcw); + + is_dumplog = lcw->lcw_callback == lc_watchdog_dumplog; + if (lcw->lcw_state != LC_WATCHDOG_DISABLED && + (dumplog || !is_dumplog)) { + lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); + if (dumplog && is_dumplog) + dumplog = 0; + } + + spin_lock_bh(&lcw_pending_timers_lock); + lcw->lcw_refcount--; /* -1 ref for callback */ + if (lcw->lcw_refcount == 0) + list_add(&lcw->lcw_list, &zombies); + } + spin_unlock_bh(&lcw_pending_timers_lock); + + while (!list_empty(&zombies)) { + lcw = list_entry(lcw_pending_timers.next, + struct lc_watchdog, lcw_list); + list_del(&lcw->lcw_list); + LIBCFS_FREE(lcw, sizeof(*lcw)); + } + } + + complete(&lcw_stop_completion); + + RETURN(rc); +} + +static void lcw_dispatch_start(void) +{ + task_t *task; + + ENTRY; + LASSERT(lcw_refcount == 1); + + init_completion(&lcw_stop_completion); + init_completion(&lcw_start_completion); + init_waitqueue_head(&lcw_event_waitq); + + CDEBUG(D_INFO, "starting dispatch thread\n"); + task = kthread_run(lcw_dispatch_main, NULL, "lc_watchdogd"); + if (IS_ERR(task)) { + CERROR("error spawning watchdog dispatch thread: %ld\n", + PTR_ERR(task)); + EXIT; + return; + } + wait_for_completion(&lcw_start_completion); + CDEBUG(D_INFO, "watchdog dispatcher initialization complete.\n"); + + EXIT; +} + +static void lcw_dispatch_stop(void) +{ + ENTRY; + LASSERT(lcw_refcount == 0); + + CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); + + set_bit(LCW_FLAG_STOP, &lcw_flags); + wake_up(&lcw_event_waitq); + + wait_for_completion(&lcw_stop_completion); + + CDEBUG(D_INFO, "watchdog dispatcher has shut down.\n"); + + EXIT; +} + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t, void *), + void *data) +{ + struct lc_watchdog *lcw = NULL; + ENTRY; + + LIBCFS_ALLOC(lcw, sizeof(*lcw)); + if (lcw == NULL) { + CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + spin_lock_init(&lcw->lcw_lock); + lcw->lcw_refcount = 1; /* refcount for owner */ + lcw->lcw_task = current; + lcw->lcw_pid = current_pid(); + lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog; + lcw->lcw_data = data; + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + INIT_LIST_HEAD(&lcw->lcw_list); + cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw); + + mutex_lock(&lcw_refcount_mutex); + if (++lcw_refcount == 1) + lcw_dispatch_start(); + mutex_unlock(&lcw_refcount_mutex); + + /* Keep this working in case we enable them by default */ + if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { + lcw->lcw_last_touched = cfs_time_current(); + cfs_timer_arm(&lcw->lcw_timer, cfs_time_seconds(timeout) + + cfs_time_current()); + } + + RETURN(lcw); +} +EXPORT_SYMBOL(lc_watchdog_add); + +static void lcw_update_time(struct lc_watchdog *lcw, const char *message) +{ + cfs_time_t newtime = cfs_time_current();; + + if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) { + struct timeval timediff; + cfs_time_t delta_time = cfs_time_sub(newtime, + lcw->lcw_last_touched); + cfs_duration_usec(delta_time, &timediff); + + LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. " + "This indicates the system was overloaded (too " + "many service threads, or there were not enough " + "hardware resources).\n", + lcw->lcw_pid, + message, + timediff.tv_sec, + timediff.tv_usec / 10000); + } + lcw->lcw_last_touched = newtime; +} + +static void lc_watchdog_del_pending(struct lc_watchdog *lcw) +{ + spin_lock_bh(&lcw->lcw_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + spin_lock_bh(&lcw_pending_timers_lock); + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + spin_unlock_bh(&lcw_pending_timers_lock); + } + + spin_unlock_bh(&lcw->lcw_lock); +} + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "resumed"); + lcw->lcw_state = LC_WATCHDOG_ENABLED; + + cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() + + cfs_time_seconds(timeout)); + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ + ENTRY; + LASSERT(lcw != NULL); + + lc_watchdog_del_pending(lcw); + + lcw_update_time(lcw, "completed"); + lcw->lcw_state = LC_WATCHDOG_DISABLED; + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ + int dead; + + ENTRY; + LASSERT(lcw != NULL); + + cfs_timer_disarm(&lcw->lcw_timer); + + lcw_update_time(lcw, "stopped"); + + spin_lock_bh(&lcw->lcw_lock); + spin_lock_bh(&lcw_pending_timers_lock); + if (unlikely(!list_empty(&lcw->lcw_list))) { + list_del_init(&lcw->lcw_list); + lcw->lcw_refcount--; /* -1 ref for pending list */ + } + + lcw->lcw_refcount--; /* -1 ref for owner */ + dead = lcw->lcw_refcount == 0; + spin_unlock_bh(&lcw_pending_timers_lock); + spin_unlock_bh(&lcw->lcw_lock); + + if (dead) + LIBCFS_FREE(lcw, sizeof(*lcw)); + + mutex_lock(&lcw_refcount_mutex); + if (--lcw_refcount == 0) + lcw_dispatch_stop(); + mutex_unlock(&lcw_refcount_mutex); + + EXIT; +} +EXPORT_SYMBOL(lc_watchdog_delete); + +/* + * Provided watchdog handlers + */ + +void lc_watchdog_dumplog(pid_t pid, void *data) +{ + libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid)); +} +EXPORT_SYMBOL(lc_watchdog_dumplog); + +#else /* !defined(WITH_WATCHDOG) */ + +struct lc_watchdog *lc_watchdog_add(int timeout, + void (*callback)(pid_t pid, void *), + void *data) +{ + static struct lc_watchdog watchdog; + return &watchdog; +} +EXPORT_SYMBOL(lc_watchdog_add); + +void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) +{ +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#endif diff --git a/drivers/staging/lustre/lustre/libcfs/workitem.c b/drivers/staging/lustre/lustre/libcfs/workitem.c new file mode 100644 index 000000000000..b533666c1900 --- /dev/null +++ b/drivers/staging/lustre/lustre/libcfs/workitem.c @@ -0,0 +1,475 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/workitem.c + * + * Author: Isaac Huang + * Liang Zhen + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +#define CFS_WS_NAME_LEN 16 + +typedef struct cfs_wi_sched { + struct list_head ws_list; /* chain on global list */ + /** serialised workitems */ + spinlock_t ws_lock; + /** where schedulers sleep */ + wait_queue_head_t ws_waitq; + /** concurrent workitems */ + struct list_head ws_runq; + /** rescheduled running-workitems, a workitem can be rescheduled + * while running in wi_action(), but we don't to execute it again + * unless it returns from wi_action(), so we put it on ws_rerunq + * while rescheduling, and move it to runq after it returns + * from wi_action() */ + struct list_head ws_rerunq; + /** CPT-table for this scheduler */ + struct cfs_cpt_table *ws_cptab; + /** CPT id for affinity */ + int ws_cpt; + /** number of scheduled workitems */ + int ws_nscheduled; + /** started scheduler thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_nthreads:30; + /** shutting down, protected by cfs_wi_data::wi_glock */ + unsigned int ws_stopping:1; + /** serialize starting thread, protected by cfs_wi_data::wi_glock */ + unsigned int ws_starting:1; + /** scheduler name */ + char ws_name[CFS_WS_NAME_LEN]; +} cfs_wi_sched_t; + +struct cfs_workitem_data { + /** serialize */ + spinlock_t wi_glock; + /** list of all schedulers */ + struct list_head wi_scheds; + /** WI module is initialized */ + int wi_init; + /** shutting down the whole WI module */ + int wi_stopping; +} cfs_wi_data; + +static inline void +cfs_wi_sched_lock(cfs_wi_sched_t *sched) +{ + spin_lock(&sched->ws_lock); +} + +static inline void +cfs_wi_sched_unlock(cfs_wi_sched_t *sched) +{ + spin_unlock(&sched->ws_lock); +} + +static inline int +cfs_wi_sched_cansleep(cfs_wi_sched_t *sched) +{ + cfs_wi_sched_lock(sched); + if (sched->ws_stopping) { + cfs_wi_sched_unlock(sched); + return 0; + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + return 0; + } + cfs_wi_sched_unlock(sched); + return 1; +} + + +/* XXX: + * 0. it only works when called from wi->wi_action. + * 1. when it returns no one shall try to schedule the workitem. + */ +void +cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + LASSERT(wi->wi_running); + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + } + + LASSERT(list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; /* LBUG future schedule attempts */ + cfs_wi_sched_unlock(sched); + + return; +} +EXPORT_SYMBOL(cfs_wi_exit); + +/** + * cancel schedule request of workitem \a wi + */ +int +cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + int rc; + + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + /* + * return 0 if it's running already, otherwise return 1, which + * means the workitem will not be scheduled and will not have + * any race with wi_action. + */ + cfs_wi_sched_lock(sched); + + rc = !(wi->wi_running); + + if (wi->wi_scheduled) { /* cancel pending schedules */ + LASSERT(!list_empty(&wi->wi_list)); + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_scheduled = 0; + } + + LASSERT (list_empty(&wi->wi_list)); + + cfs_wi_sched_unlock(sched); + return rc; +} +EXPORT_SYMBOL(cfs_wi_deschedule); + +/* + * Workitem scheduled with (serial == 1) is strictly serialised not only with + * itself, but also with others scheduled this way. + * + * Now there's only one static serialised queue, but in the future more might + * be added, and even dynamic creation of serialised queues might be supported. + */ +void +cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) +{ + LASSERT(!in_interrupt()); /* because we use plain spinlock */ + LASSERT(!sched->ws_stopping); + + cfs_wi_sched_lock(sched); + + if (!wi->wi_scheduled) { + LASSERT (list_empty(&wi->wi_list)); + + wi->wi_scheduled = 1; + sched->ws_nscheduled++; + if (!wi->wi_running) { + list_add_tail(&wi->wi_list, &sched->ws_runq); + wake_up(&sched->ws_waitq); + } else { + list_add(&wi->wi_list, &sched->ws_rerunq); + } + } + + LASSERT (!list_empty(&wi->wi_list)); + cfs_wi_sched_unlock(sched); + return; +} +EXPORT_SYMBOL(cfs_wi_schedule); + + +static int +cfs_wi_scheduler (void *arg) +{ + struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg; + + cfs_block_allsigs(); + + /* CPT affinity scheduler? */ + if (sched->ws_cptab != NULL) + cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt); + + spin_lock(&cfs_wi_data.wi_glock); + + LASSERT(sched->ws_starting == 1); + sched->ws_starting--; + sched->ws_nthreads++; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_lock(sched); + + while (!sched->ws_stopping) { + int nloops = 0; + int rc; + cfs_workitem_t *wi; + + while (!list_empty(&sched->ws_runq) && + nloops < CFS_WI_RESCHED) { + wi = list_entry(sched->ws_runq.next, + cfs_workitem_t, wi_list); + LASSERT(wi->wi_scheduled && !wi->wi_running); + + list_del_init(&wi->wi_list); + + LASSERT(sched->ws_nscheduled > 0); + sched->ws_nscheduled--; + + wi->wi_running = 1; + wi->wi_scheduled = 0; + + + cfs_wi_sched_unlock(sched); + nloops++; + + rc = (*wi->wi_action) (wi); + + cfs_wi_sched_lock(sched); + if (rc != 0) /* WI should be dead, even be freed! */ + continue; + + wi->wi_running = 0; + if (list_empty(&wi->wi_list)) + continue; + + LASSERT(wi->wi_scheduled); + /* wi is rescheduled, should be on rerunq now, we + * move it to runq so it can run action now */ + list_move_tail(&wi->wi_list, &sched->ws_runq); + } + + if (!list_empty(&sched->ws_runq)) { + cfs_wi_sched_unlock(sched); + /* don't sleep because some workitems still + * expect me to come back soon */ + cond_resched(); + cfs_wi_sched_lock(sched); + continue; + } + + cfs_wi_sched_unlock(sched); + cfs_wait_event_interruptible_exclusive(sched->ws_waitq, + !cfs_wi_sched_cansleep(sched), rc); + cfs_wi_sched_lock(sched); + } + + cfs_wi_sched_unlock(sched); + + spin_lock(&cfs_wi_data.wi_glock); + sched->ws_nthreads--; + spin_unlock(&cfs_wi_data.wi_glock); + + return 0; +} + + +void +cfs_wi_sched_destroy(struct cfs_wi_sched *sched) +{ + int i; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + + spin_lock(&cfs_wi_data.wi_glock); + if (sched->ws_stopping) { + CDEBUG(D_INFO, "%s is in progress of stopping\n", + sched->ws_name); + spin_unlock(&cfs_wi_data.wi_glock); + return; + } + + LASSERT(!list_empty(&sched->ws_list)); + sched->ws_stopping = 1; + + spin_unlock(&cfs_wi_data.wi_glock); + + i = 2; + wake_up_all(&sched->ws_waitq); + + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_nthreads > 0) { + CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET, + "waiting for %d threads of WI sched[%s] to terminate\n", + sched->ws_nthreads, sched->ws_name); + + spin_unlock(&cfs_wi_data.wi_glock); + cfs_pause(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + + list_del(&sched->ws_list); + + spin_unlock(&cfs_wi_data.wi_glock); + LASSERT(sched->ws_nscheduled == 0); + + LIBCFS_FREE(sched, sizeof(*sched)); +} +EXPORT_SYMBOL(cfs_wi_sched_destroy); + +int +cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, + int cpt, int nthrs, struct cfs_wi_sched **sched_pp) +{ + struct cfs_wi_sched *sched; + int rc; + + LASSERT(cfs_wi_data.wi_init); + LASSERT(!cfs_wi_data.wi_stopping); + LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || + (cpt >= 0 && cpt < cfs_cpt_number(cptab))); + + LIBCFS_ALLOC(sched, sizeof(*sched)); + if (sched == NULL) + return -ENOMEM; + + strncpy(sched->ws_name, name, CFS_WS_NAME_LEN); + sched->ws_cptab = cptab; + sched->ws_cpt = cpt; + + spin_lock_init(&sched->ws_lock); + init_waitqueue_head(&sched->ws_waitq); + INIT_LIST_HEAD(&sched->ws_runq); + INIT_LIST_HEAD(&sched->ws_rerunq); + INIT_LIST_HEAD(&sched->ws_list); + + rc = 0; + while (nthrs > 0) { + char name[16]; + task_t *task; + spin_lock(&cfs_wi_data.wi_glock); + while (sched->ws_starting > 0) { + spin_unlock(&cfs_wi_data.wi_glock); + schedule(); + spin_lock(&cfs_wi_data.wi_glock); + } + + sched->ws_starting++; + spin_unlock(&cfs_wi_data.wi_glock); + + if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { + snprintf(name, sizeof(name), "%s_%02d_%02d", + sched->ws_name, sched->ws_cpt, + sched->ws_nthreads); + } else { + snprintf(name, sizeof(name), "%s_%02d", + sched->ws_name, sched->ws_nthreads); + } + + task = kthread_run(cfs_wi_scheduler, sched, name); + if (!IS_ERR(task)) { + nthrs--; + continue; + } + rc = PTR_ERR(task); + + CERROR("Failed to create thread for WI scheduler %s: %d\n", + name, rc); + + spin_lock(&cfs_wi_data.wi_glock); + + /* make up for cfs_wi_sched_destroy */ + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + sched->ws_starting--; + + spin_unlock(&cfs_wi_data.wi_glock); + + cfs_wi_sched_destroy(sched); + return rc; + } + spin_lock(&cfs_wi_data.wi_glock); + list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); + spin_unlock(&cfs_wi_data.wi_glock); + + *sched_pp = sched; + return 0; +} +EXPORT_SYMBOL(cfs_wi_sched_create); + +int +cfs_wi_startup(void) +{ + memset(&cfs_wi_data, 0, sizeof(cfs_wi_data)); + + spin_lock_init(&cfs_wi_data.wi_glock); + INIT_LIST_HEAD(&cfs_wi_data.wi_scheds); + cfs_wi_data.wi_init = 1; + + return 0; +} + +void +cfs_wi_shutdown (void) +{ + struct cfs_wi_sched *sched; + + spin_lock(&cfs_wi_data.wi_glock); + cfs_wi_data.wi_stopping = 1; + spin_unlock(&cfs_wi_data.wi_glock); + + /* nobody should contend on this list */ + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + sched->ws_stopping = 1; + wake_up_all(&sched->ws_waitq); + } + + list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) { + spin_lock(&cfs_wi_data.wi_glock); + + while (sched->ws_nthreads != 0) { + spin_unlock(&cfs_wi_data.wi_glock); + cfs_pause(cfs_time_seconds(1) / 20); + spin_lock(&cfs_wi_data.wi_glock); + } + spin_unlock(&cfs_wi_data.wi_glock); + } + while (!list_empty(&cfs_wi_data.wi_scheds)) { + sched = list_entry(cfs_wi_data.wi_scheds.next, + struct cfs_wi_sched, ws_list); + list_del(&sched->ws_list); + LIBCFS_FREE(sched, sizeof(*sched)); + } + + cfs_wi_data.wi_stopping = 0; + cfs_wi_data.wi_init = 0; +} diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile new file mode 100644 index 000000000000..dff0c0486e77 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTRE_FS) += lustre.o +obj-$(CONFIG_LUSTRE_FS) += llite_lloop.o +lustre-y := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o \ + rw.o lproc_llite.o namei.o symlink.o llite_mmap.o \ + xattr.o remote_perm.o llite_rmtacl.o llite_capa.o \ + rw26.o super25.o statahead.o \ + ../lclient/glimpse.o ../lclient/lcommon_cl.o ../lclient/lcommon_misc.o \ + vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o + +llite_lloop-y := lloop.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c new file mode 100644 index 000000000000..e048538d45e6 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/dcache.c @@ -0,0 +1,675 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include "llite_internal.h" + +static void free_dentry_data(struct rcu_head *head) +{ + struct ll_dentry_data *lld; + + lld = container_of(head, struct ll_dentry_data, lld_rcu_head); + OBD_FREE_PTR(lld); +} + +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static void ll_release(struct dentry *de) +{ + struct ll_dentry_data *lld; + ENTRY; + LASSERT(de != NULL); + lld = ll_d2d(de); + if (lld == NULL) /* NFS copies the de->d_op methods (bug 4655) */ + RETURN_EXIT; + + if (lld->lld_it) { + ll_intent_release(lld->lld_it); + OBD_FREE(lld->lld_it, sizeof(*lld->lld_it)); + } + LASSERT(lld->lld_cwd_count == 0); + LASSERT(lld->lld_mnt_count == 0); + de->d_fsdata = NULL; + call_rcu(&lld->lld_rcu_head, free_dentry_data); + + EXIT; +} + +/* Compare if two dentries are the same. Don't match if the existing dentry + * is marked invalid. Returns 1 if different, 0 if the same. + * + * This avoids a race where ll_lookup_it() instantiates a dentry, but we get + * an AST before calling d_revalidate_it(). The dentry still exists (marked + * INVALID) so d_lookup() matches it, but we have no lock on it (so + * lock_match() fails) and we spin around real_lookup(). */ +int ll_dcompare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + ENTRY; + + if (len != name->len) + RETURN(1); + + if (memcmp(str, name->name, len)) + RETURN(1); + + CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", + name->len, name->name, dentry, dentry->d_flags, + d_refcount(dentry)); + + /* mountpoint is always valid */ + if (d_mountpoint((struct dentry *)dentry)) + RETURN(0); + + if (d_lustre_invalid(dentry)) + RETURN(1); + + RETURN(0); +} + +static inline int return_if_equal(struct ldlm_lock *lock, void *data) +{ + if ((lock->l_flags & + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) == + (LDLM_FL_CANCELING | LDLM_FL_DISCARD_DATA)) + return LDLM_ITER_CONTINUE; + return LDLM_ITER_STOP; +} + +/* find any ldlm lock of the inode in mdc and lov + * return 0 not find + * 1 find one + * < 0 error */ +static int find_cbdata(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; + + LASSERT(inode); + rc = md_find_cbdata(sbi->ll_md_exp, ll_inode2fid(inode), + return_if_equal, NULL); + if (rc != 0) + RETURN(rc); + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + RETURN(rc); + + rc = obd_find_cbdata(sbi->ll_dt_exp, lsm, return_if_equal, NULL); + ccc_inode_lsm_put(inode, lsm); + + RETURN(rc); +} + +/** + * Called when last reference to a dentry is dropped and dcache wants to know + * whether or not it should cache it: + * - return 1 to delete the dentry immediately + * - return 0 to cache the dentry + * Should NOT be called with the dcache lock, see fs/dcache.c + */ +static int ll_ddelete(const struct dentry *de) +{ + ENTRY; + LASSERT(de); + + CDEBUG(D_DENTRY, "%s dentry %.*s (%p, parent %p, inode %p) %s%s\n", + d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + d_unhashed((struct dentry *)de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? "" : "subdirs"); + + /* kernel >= 2.6.38 last refcount is decreased after this function. */ + LASSERT(d_refcount(de) == 1); + + /* Disable this piece of code temproarily because this is called + * inside dcache_lock so it's not appropriate to do lots of work + * here. ATTENTION: Before this piece of code enabling, LU-2487 must be + * resolved. */ +#if 0 + /* if not ldlm lock for this inode, set i_nlink to 0 so that + * this inode can be recycled later b=20433 */ + if (de->d_inode && !find_cbdata(de->d_inode)) + clear_nlink(de->d_inode); +#endif + + if (d_lustre_invalid((struct dentry *)de)) + RETURN(1); + RETURN(0); +} + +static int ll_set_dd(struct dentry *de) +{ + ENTRY; + LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + d_refcount(de)); + + if (de->d_fsdata == NULL) { + struct ll_dentry_data *lld; + + OBD_ALLOC_PTR(lld); + if (likely(lld != NULL)) { + spin_lock(&de->d_lock); + if (likely(de->d_fsdata == NULL)) + de->d_fsdata = lld; + else + OBD_FREE_PTR(lld); + spin_unlock(&de->d_lock); + } else { + RETURN(-ENOMEM); + } + } + + RETURN(0); +} + +int ll_dops_init(struct dentry *de, int block, int init_sa) +{ + struct ll_dentry_data *lld = ll_d2d(de); + int rc = 0; + + if (lld == NULL && block != 0) { + rc = ll_set_dd(de); + if (rc) + return rc; + + lld = ll_d2d(de); + } + + if (lld != NULL && init_sa != 0) + lld->lld_sa_generation = 0; + + /* kernel >= 2.6.38 d_op is set in d_alloc() */ + LASSERT(de->d_op == &ll_d_ops); + return rc; +} + +void ll_intent_drop_lock(struct lookup_intent *it) +{ + if (it->it_op && it->d.lustre.it_lock_mode) { + struct lustre_handle handle; + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64 + " from it %p\n", handle.cookie, it); + ldlm_lock_decref(&handle, it->d.lustre.it_lock_mode); + + /* bug 494: intent_release may be called multiple times, from + * this thread and we don't want to double-decref this lock */ + it->d.lustre.it_lock_mode = 0; + if (it->d.lustre.it_remote_lock_mode != 0) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + + CDEBUG(D_DLMTRACE, "releasing remote lock with cookie" + LPX64" from it %p\n", handle.cookie, it); + ldlm_lock_decref(&handle, + it->d.lustre.it_remote_lock_mode); + it->d.lustre.it_remote_lock_mode = 0; + } + } +} + +void ll_intent_release(struct lookup_intent *it) +{ + ENTRY; + + CDEBUG(D_INFO, "intent %p released\n", it); + ll_intent_drop_lock(it); + /* We are still holding extra reference on a request, need to free it */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished(it->d.lustre.it_data); /* ll_file_open */ + if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ + ptlrpc_req_finished(it->d.lustre.it_data); + if (it_disposition(it, DISP_ENQ_COMPLETE)) /* saved req from revalidate + * to lookup */ + ptlrpc_req_finished(it->d.lustre.it_data); + + it->d.lustre.it_disposition = 0; + it->d.lustre.it_data = NULL; + EXIT; +} + +void ll_invalidate_aliases(struct inode *inode) +{ + struct dentry *dentry; + struct ll_d_hlist_node *p; + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", + inode->i_ino, inode->i_generation, inode); + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) { + CDEBUG(D_DENTRY, "dentry in drop %.*s (%p) parent %p " + "inode %p flags %d\n", dentry->d_name.len, + dentry->d_name.name, dentry, dentry->d_parent, + dentry->d_inode, dentry->d_flags); + + if (dentry->d_name.len == 1 && dentry->d_name.name[0] == '/') { + CERROR("called on root (?) dentry=%p, inode=%p " + "ino=%lu\n", dentry, inode, inode->i_ino); + lustre_dump_dentry(dentry, 1); + libcfs_debug_dumpstack(NULL); + } + + d_lustre_invalidate(dentry); + } + ll_unlock_dcache(inode); + + EXIT; +} + +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, + struct dentry *de) +{ + int rc = 0; + ENTRY; + + if (!request) + RETURN(0); + + if (it_disposition(it, DISP_LOOKUP_NEG)) + RETURN(-ENOENT); + + rc = ll_prep_inode(&de->d_inode, request, NULL, it); + + RETURN(rc); +} + +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) +{ + LASSERT(it != NULL); + LASSERT(dentry != NULL); + + if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) { + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + } + + /* drop lookup or getattr locks immediately */ + if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { + /* on 2.6 there are situation when several lookups and + * revalidations may be requested during single operation. + * therefore, we don't release intent here -bzzz */ + ll_intent_drop_lock(it); + } +} + +void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft) +{ + struct lookup_intent *it = *itp; + + if (!it || it->it_op == IT_GETXATTR) + it = *itp = deft; + +} + +int ll_revalidate_it(struct dentry *de, int lookup_flags, + struct lookup_intent *it) +{ + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct obd_export *exp; + struct inode *parent = de->d_parent->d_inode; + int rc; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name, + LL_IT2STR(it)); + + if (de->d_inode == NULL) { + __u64 ibits; + + /* We can only use negative dentries if this is stat or lookup, + for opens and stuff we do need to query server. */ + /* If there is IT_CREAT in intent op set, then we must throw + away this negative dentry and actually do the request to + kernel to create whatever needs to be created (if possible)*/ + if (it && (it->it_op & IT_CREAT)) + RETURN(0); + + if (d_lustre_invalid(de)) + RETURN(0); + + ibits = MDS_INODELOCK_UPDATE; + rc = ll_have_md_lock(parent, &ibits, LCK_MINMODE); + GOTO(out_sa, rc); + } + + /* Never execute intents for mount points. + * Attributes will be fixed up in ll_inode_revalidate_it */ + if (d_mountpoint(de)) + GOTO(out_sa, rc = 1); + + /* need to get attributes in case root got changed from other client */ + if (de == de->d_sb->s_root) { + rc = __ll_inode_revalidate_it(de, it, MDS_INODELOCK_LOOKUP); + if (rc == 0) + rc = 1; + GOTO(out_sa, rc); + } + + exp = ll_i2mdexp(de->d_inode); + + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_REVALIDATE_PAUSE, 5); + ll_frob_intent(&it, &lookup_it); + LASSERT(it); + + if (it->it_op == IT_LOOKUP && !d_lustre_invalid(de)) + RETURN(1); + + if (it->it_op == IT_OPEN) { + struct inode *inode = de->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + __u64 *och_usecount; + __u64 ibits; + + /* + * We used to check for MDS_INODELOCK_OPEN here, but in fact + * just having LOOKUP lock is enough to justify inode is the + * same. And if inode is the same and we have suitable + * openhandle, then there is no point in doing another OPEN RPC + * just to throw away newly received openhandle. There are no + * security implications too, if file owner or access mode is + * change, LOOKUP lock is revoked. + */ + + + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + /* Check for the proper lock. */ + ibits = MDS_INODELOCK_LOOKUP; + if (!ll_have_md_lock(inode, &ibits, LCK_MINMODE)) + goto do_lock; + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Everything is open already, do nothing */ + /*(*och_usecount)++; Do not let them steal our open + handle from under us */ + SET_BUT_UNUSED(och_usecount); + /* XXX The code above was my original idea, but in case + we have the handle, but we cannot use it due to later + checks (e.g. O_CREAT|O_EXCL flags set), nobody + would decrement counter increased here. So we just + hope the lock won't be invalidated in between. But + if it would be, we'll reopen the open request to + MDS later during file open path */ + mutex_unlock(&lli->lli_och_mutex); + RETURN(1); + } else { + mutex_unlock(&lli->lli_och_mutex); + } + } + + if (it->it_op == IT_GETATTR) { + rc = ll_statahead_enter(parent, &de, 0); + if (rc == 1) + goto mark; + else if (rc != -EAGAIN && rc != 0) + GOTO(out, rc = 0); + } + +do_lock: + op_data = ll_prep_md_op_data(NULL, parent, de->d_inode, + de->d_name.name, de->d_name.len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (!IS_POSIXACL(parent) || !exp_connect_umask(exp)) + it->it_create_mode &= ~current_umask(); + it->it_create_mode |= M_CHECK_STALE; + rc = md_intent_lock(exp, op_data, NULL, 0, it, + lookup_flags, + &req, ll_md_blocking_ast, 0); + it->it_create_mode &= ~M_CHECK_STALE; + ll_finish_md_op_data(op_data); + + /* If req is NULL, then md_intent_lock only tried to do a lock match; + * if all was well, it will return 1 if it found locks, 0 otherwise. */ + if (req == NULL && rc >= 0) { + if (!rc) + goto do_lookup; + GOTO(out, rc); + } + + if (rc < 0) { + if (rc != -ESTALE) { + CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status " + "%d\n", rc, it->d.lustre.it_status); + } + GOTO(out, rc = 0); + } + +revalidate_finish: + rc = ll_revalidate_it_finish(req, it, de); + if (rc != 0) { + if (rc != -ESTALE && rc != -ENOENT) + ll_intent_release(it); + GOTO(out, rc = 0); + } + + if ((it->it_op & IT_OPEN) && de->d_inode && + !S_ISREG(de->d_inode->i_mode) && + !S_ISDIR(de->d_inode->i_mode)) { + ll_release_openhandle(de, it); + } + rc = 1; + +out: + /* We do not free request as it may be reused during following lookup + * (see comment in mdc/mdc_locks.c::mdc_intent_lock()), request will + * be freed in ll_lookup_it or in ll_intent_release. But if + * request was not completed, we need to free it. (bug 5154, 9903) */ + if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE)) + ptlrpc_req_finished(req); + if (rc == 0) { + /* mdt may grant layout lock for the newly created file, so + * release the lock to avoid leaking */ + ll_intent_drop_lock(it); + ll_invalidate_aliases(de->d_inode); + } else { + __u64 bits = 0; + __u64 matched_bits = 0; + + CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p " + "inode %p refc %d\n", de->d_name.len, + de->d_name.name, de, de->d_parent, de->d_inode, + d_refcount(de)); + + ll_set_lock_data(exp, de->d_inode, it, &bits); + + /* Note: We have to match both LOOKUP and PERM lock + * here to make sure the dentry is valid and no one + * changing the permission. + * But if the client connects < 2.4 server, which will + * only grant LOOKUP lock, so we can only Match LOOKUP + * lock for old server */ + if (exp_connect_flags(ll_i2mdexp(de->d_inode)) && + OBD_CONNECT_LVB_TYPE) + matched_bits = + MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM; + else + matched_bits = MDS_INODELOCK_LOOKUP; + + if (((bits & matched_bits) == matched_bits) && + d_lustre_invalid(de)) + d_lustre_revalidate(de); + ll_lookup_finish_locks(it, de); + } + +mark: + if (it != NULL && it->it_op == IT_GETATTR && rc > 0) + ll_statahead_mark(parent, de); + RETURN(rc); + + /* + * This part is here to combat evil-evil race in real_lookup on 2.6 + * kernels. The race details are: We enter do_lookup() looking for some + * name, there is nothing in dcache for this name yet and d_lookup() + * returns NULL. We proceed to real_lookup(), and while we do this, + * another process does open on the same file we looking up (most simple + * reproducer), open succeeds and the dentry is added. Now back to + * us. In real_lookup() we do d_lookup() again and suddenly find the + * dentry, so we call d_revalidate on it, but there is no lock, so + * without this code we would return 0, but unpatched real_lookup just + * returns -ENOENT in such a case instead of retrying the lookup. Once + * this is dealt with in real_lookup(), all of this ugly mess can go and + * we can just check locks in ->d_revalidate without doing any RPCs + * ever. + */ +do_lookup: + if (it != &lookup_it) { + /* MDS_INODELOCK_UPDATE needed for IT_GETATTR case. */ + if (it->it_op == IT_GETATTR) + lookup_it.it_op = IT_GETATTR; + ll_lookup_finish_locks(it, de); + it = &lookup_it; + } + + /* Do real lookup here. */ + op_data = ll_prep_md_op_data(NULL, parent, NULL, de->d_name.name, + de->d_name.len, 0, (it->it_op & IT_CREAT ? + LUSTRE_OPC_CREATE : + LUSTRE_OPC_ANY), NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_intent_lock(exp, op_data, NULL, 0, it, 0, &req, + ll_md_blocking_ast, 0); + if (rc >= 0) { + struct mdt_body *mdt_body; + struct lu_fid fid = {.f_seq = 0, .f_oid = 0, .f_ver = 0}; + mdt_body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + if (de->d_inode) + fid = *ll_inode2fid(de->d_inode); + + /* see if we got same inode, if not - return error */ + if (lu_fid_eq(&fid, &mdt_body->fid1)) { + ll_finish_md_op_data(op_data); + op_data = NULL; + goto revalidate_finish; + } + ll_intent_release(it); + } + ll_finish_md_op_data(op_data); + GOTO(out, rc = 0); + +out_sa: + /* + * For rc == 1 case, should not return directly to prevent losing + * statahead windows; for rc == 0 case, the "lookup" will be done later. + */ + if (it != NULL && it->it_op == IT_GETATTR && rc == 1) + ll_statahead_enter(parent, &de, 1); + goto mark; +} + +/* + * Always trust cached dentries. Update statahead window if necessary. + */ +int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) +{ + struct inode *parent = dentry->d_parent->d_inode; + int unplug = 0; + + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,flags=%u\n", + dentry->d_name.name, flags); + + if (!(flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) && + ll_need_statahead(parent, dentry) > 0) { + if (flags & LOOKUP_RCU) + RETURN(-ECHILD); + + if (dentry->d_inode == NULL) + unplug = 1; + do_statahead_enter(parent, &dentry, unplug); + ll_statahead_mark(parent, dentry); + } + + RETURN(1); +} + + +void ll_d_iput(struct dentry *de, struct inode *inode) +{ + LASSERT(inode); + if (!find_cbdata(inode)) + clear_nlink(inode); + iput(inode); +} + +struct dentry_operations ll_d_ops = { + .d_revalidate = ll_revalidate_nd, + .d_release = ll_release, + .d_delete = ll_ddelete, + .d_iput = ll_d_iput, + .d_compare = ll_dcompare, +}; diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c new file mode 100644 index 000000000000..23c61fe81965 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/dir.c @@ -0,0 +1,1978 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/dir.c + * + * Directory code for lustre client. + */ + +#include +#include +#include +#include +#include +#include // for wait_on_buffer +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +/* + * (new) readdir implementation overview. + * + * Original lustre readdir implementation cached exact copy of raw directory + * pages on the client. These pages were indexed in client page cache by + * logical offset in the directory file. This design, while very simple and + * intuitive had some inherent problems: + * + * . it implies that byte offset to the directory entry serves as a + * telldir(3)/seekdir(3) cookie, but that offset is not stable: in + * ext3/htree directory entries may move due to splits, and more + * importantly, + * + * . it is incompatible with the design of split directories for cmd3, + * that assumes that names are distributed across nodes based on their + * hash, and so readdir should be done in hash order. + * + * New readdir implementation does readdir in hash order, and uses hash of a + * file name as a telldir/seekdir cookie. This led to number of complications: + * + * . hash is not unique, so it cannot be used to index cached directory + * pages on the client (note, that it requires a whole pageful of hash + * collided entries to cause two pages to have identical hashes); + * + * . hash is not unique, so it cannot, strictly speaking, be used as an + * entry cookie. ext3/htree has the same problem and lustre implementation + * mimics their solution: seekdir(hash) positions directory at the first + * entry with the given hash. + * + * Client side. + * + * 0. caching + * + * Client caches directory pages using hash of the first entry as an index. As + * noted above hash is not unique, so this solution doesn't work as is: + * special processing is needed for "page hash chains" (i.e., sequences of + * pages filled with entries all having the same hash value). + * + * First, such chains have to be detected. To this end, server returns to the + * client the hash of the first entry on the page next to one returned. When + * client detects that this hash is the same as hash of the first entry on the + * returned page, page hash collision has to be handled. Pages in the + * hash chain, except first one, are termed "overflow pages". + * + * Solution to index uniqueness problem is to not cache overflow + * pages. Instead, when page hash collision is detected, all overflow pages + * from emerging chain are immediately requested from the server and placed in + * a special data structure (struct ll_dir_chain). This data structure is used + * by ll_readdir() to process entries from overflow pages. When readdir + * invocation finishes, overflow pages are discarded. If page hash collision + * chain weren't completely processed, next call to readdir will again detect + * page hash collision, again read overflow pages in, process next portion of + * entries and again discard the pages. This is not as wasteful as it looks, + * because, given reasonable hash, page hash collisions are extremely rare. + * + * 1. directory positioning + * + * When seekdir(hash) is called, original + * + * + * + * + * + * + * + * + * Server. + * + * identification of and access to overflow pages + * + * page format + * + * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains + * a header lu_dirpage which describes the start/end hash, and whether this + * page is empty (contains no dir entry) or hash collide with next page. + * After client receives reply, several pages will be integrated into dir page + * in PAGE_CACHE_SIZE (if PAGE_CACHE_SIZE greater than LU_PAGE_SIZE), and the + * lu_dirpage for this integrated page will be adjusted. See + * lmv_adjust_dirpages(). + * + */ + +/* returns the page unlocked, but with a reference */ +static int ll_dir_filler(void *_hash, struct page *page0) +{ + struct inode *inode = page0->mapping->host; + int hash64 = ll_i2sbi(inode)->ll_flags & LL_SBI_64BIT_HASH; + struct obd_export *exp = ll_i2sbi(inode)->ll_md_exp; + struct ptlrpc_request *request; + struct mdt_body *body; + struct md_op_data *op_data; + __u64 hash = *((__u64 *)_hash); + struct page **page_pool; + struct page *page; + struct lu_dirpage *dp; + int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT; + int nrdpgs = 0; /* number of pages read actually */ + int npages; + int i; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) hash "LPU64"\n", + inode->i_ino, inode->i_generation, inode, hash); + + LASSERT(max_pages > 0 && max_pages <= MD_MAX_BRW_PAGES); + + OBD_ALLOC(page_pool, sizeof(page) * max_pages); + if (page_pool != NULL) { + page_pool[0] = page0; + } else { + page_pool = &page0; + max_pages = 1; + } + for (npages = 1; npages < max_pages; npages++) { + page = page_cache_alloc_cold(inode->i_mapping); + if (!page) + break; + page_pool[npages] = page; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + op_data->op_npages = npages; + op_data->op_offset = hash; + rc = md_readpage(exp, op_data, page_pool, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + /* Checked by mdc_readpage() */ + LASSERT(body != NULL); + + if (body->valid & OBD_MD_FLSIZE) + cl_isize_write(inode, body->size); + + nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1) + >> PAGE_CACHE_SHIFT; + SetPageUptodate(page0); + } + unlock_page(page0); + ptlrpc_req_finished(request); + + CDEBUG(D_VFSTRACE, "read %d/%d pages\n", nrdpgs, npages); + + ll_pagevec_init(&lru_pvec, 0); + for (i = 1; i < npages; i++) { + unsigned long offset; + int ret; + + page = page_pool[i]; + + if (rc < 0 || i >= nrdpgs) { + page_cache_release(page); + continue; + } + + SetPageUptodate(page); + + dp = kmap(page); + hash = le64_to_cpu(dp->ldp_hash_start); + kunmap(page); + + offset = hash_x_index(hash, hash64); + + prefetchw(&page->flags); + ret = add_to_page_cache_lru(page, inode->i_mapping, offset, + GFP_KERNEL); + if (ret == 0) { + unlock_page(page); + if (ll_pagevec_add(&lru_pvec, page) == 0) + ll_pagevec_lru_add_file(&lru_pvec); + } else { + CDEBUG(D_VFSTRACE, "page %lu add to page cache failed:" + " %d\n", offset, ret); + } + page_cache_release(page); + } + ll_pagevec_lru_add_file(&lru_pvec); + + if (page_pool != &page0) + OBD_FREE(page_pool, sizeof(struct page *) * max_pages); + EXIT; + return rc; +} + +static void ll_check_page(struct inode *dir, struct page *page) +{ + /* XXX: check page format later */ + SetPageChecked(page); +} + +void ll_release_page(struct page *page, int remove) +{ + kunmap(page); + if (remove) { + lock_page(page); + if (likely(page->mapping != NULL)) + truncate_complete_page(page->mapping, page); + unlock_page(page); + } + page_cache_release(page); +} + +/* + * Find, kmap and return page that contains given hash. + */ +static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash, + __u64 *start, __u64 *end) +{ + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + struct address_space *mapping = dir->i_mapping; + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(*hash, hash64); + struct page *page; + int found; + + TREE_READ_LOCK_IRQ(mapping); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0) { + struct lu_dirpage *dp; + + page_cache_get(page); + TREE_READ_UNLOCK_IRQ(mapping); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * ll_dir_filler() does synchronous io. + */ + wait_on_page_locked(page); + if (PageUptodate(page)) { + dp = kmap(page); + if (BITS_PER_LONG == 32 && hash64) { + *start = le64_to_cpu(dp->ldp_hash_start) >> 32; + *end = le64_to_cpu(dp->ldp_hash_end) >> 32; + *hash = *hash >> 32; + } else { + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + } + LASSERTF(*start <= *hash, "start = "LPX64",end = " + LPX64",hash = "LPX64"\n", *start, *end, *hash); + CDEBUG(D_VFSTRACE, "page %lu [%llu %llu], hash "LPU64"\n", + offset, *start, *end, *hash); + if (*hash > *end) { + ll_release_page(page, 0); + page = NULL; + } else if (*end != *start && *hash == *end) { + /* + * upon hash collision, remove this page, + * otherwise put page reference, and + * ll_get_dir_page() will issue RPC to fetch + * the page we want. + */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + page = NULL; + } + } else { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + + } else { + TREE_READ_UNLOCK_IRQ(mapping); + page = NULL; + } + return page; +} + +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain) +{ + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; + struct address_space *mapping = dir->i_mapping; + struct lustre_handle lockh; + struct lu_dirpage *dp; + struct page *page; + ldlm_mode_t mode; + int rc; + __u64 start = 0; + __u64 end = 0; + __u64 lhash = hash; + struct ll_inode_info *lli = ll_i2info(dir); + int hash64 = ll_i2sbi(dir)->ll_flags & LL_SBI_64BIT_HASH; + + mode = LCK_PR; + rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED, + ll_inode2fid(dir), LDLM_IBITS, &policy, mode, &lockh); + if (!rc) { + struct ldlm_enqueue_info einfo = {.ei_type = LDLM_IBITS, + .ei_mode = mode, + .ei_cb_bl = + ll_md_blocking_ast, + .ei_cb_cp = + ldlm_completion_ast, + .ei_cb_gl = NULL, + .ei_cb_wg = NULL, + .ei_cbdata = NULL}; + struct lookup_intent it = { .it_op = IT_READDIR }; + struct ptlrpc_request *request; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, dir, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + return (void *)op_data; + + rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, &einfo, &it, + op_data, &lockh, NULL, 0, NULL, 0); + + ll_finish_md_op_data(op_data); + + request = (struct ptlrpc_request *)it.d.lustre.it_data; + if (request) + ptlrpc_req_finished(request); + if (rc < 0) { + CERROR("lock enqueue: "DFID" at "LPU64": rc %d\n", + PFID(ll_inode2fid(dir)), hash, rc); + return ERR_PTR(rc); + } + + CDEBUG(D_INODE, "setting lr_lvb_inode to inode %p (%lu/%u)\n", + dir, dir->i_ino, dir->i_generation); + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, + &it.d.lustre.it_lock_handle, dir, NULL); + } else { + /* for cross-ref object, l_ast_data of the lock may not be set, + * we reset it here */ + md_set_lock_data(ll_i2sbi(dir)->ll_md_exp, &lockh.cookie, + dir, NULL); + } + ldlm_lock_dump_handle(D_OTHER, &lockh); + + mutex_lock(&lli->lli_readdir_mutex); + page = ll_dir_page_locate(dir, &lhash, &start, &end); + if (IS_ERR(page)) { + CERROR("dir page locate: "DFID" at "LPU64": rc %ld\n", + PFID(ll_inode2fid(dir)), lhash, PTR_ERR(page)); + GOTO(out_unlock, page); + } else if (page != NULL) { + /* + * XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 is such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. + */ + GOTO(hash_collision, page); + } + + page = read_cache_page(mapping, hash_x_index(hash, hash64), + ll_dir_filler, &lhash); + if (IS_ERR(page)) { + CERROR("read cache page: "DFID" at "LPU64": rc %ld\n", + PFID(ll_inode2fid(dir)), hash, PTR_ERR(page)); + GOTO(out_unlock, page); + } + + wait_on_page_locked(page); + (void)kmap(page); + if (!PageUptodate(page)) { + CERROR("page not updated: "DFID" at "LPU64": rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } + if (!PageChecked(page)) + ll_check_page(dir, page); + if (PageError(page)) { + CERROR("page error: "DFID" at "LPU64": rc %d\n", + PFID(ll_inode2fid(dir)), hash, -5); + goto fail; + } +hash_collision: + dp = page_address(page); + if (BITS_PER_LONG == 32 && hash64) { + start = le64_to_cpu(dp->ldp_hash_start) >> 32; + end = le64_to_cpu(dp->ldp_hash_end) >> 32; + lhash = hash >> 32; + } else { + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + lhash = hash; + } + if (end == start) { + LASSERT(start == lhash); + CWARN("Page-wide hash collision: "LPU64"\n", end); + if (BITS_PER_LONG == 32 && hash64) + CWARN("Real page-wide hash collision at ["LPU64" "LPU64 + "] with hash "LPU64"\n", + le64_to_cpu(dp->ldp_hash_start), + le64_to_cpu(dp->ldp_hash_end), hash); + /* + * Fetch whole overflow chain... + * + * XXX not yet. + */ + goto fail; + } +out_unlock: + mutex_unlock(&lli->lli_readdir_mutex); + ldlm_lock_decref(&lockh, mode); + return page; + +fail: + ll_release_page(page, 1); + page = ERR_PTR(-EIO); + goto out_unlock; +} + +int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie, + filldir_t filldir) +{ + struct ll_inode_info *info = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = *_pos; + int api32 = ll_need_32bit_api(sbi); + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + struct page *page; + struct ll_dir_chain chain; + int done = 0; + int rc = 0; + ENTRY; + + ll_dir_chain_init(&chain); + + page = ll_get_dir_page(inode, pos, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (!IS_ERR(page)) { + /* + * If page is empty (end of directory is reached), + * use this value. + */ + __u64 hash = MDS_DIR_END_OFF; + __u64 next; + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + __u16 type; + int namelen; + struct lu_fid fid; + __u64 lhash; + __u64 ino; + + /* + * XXX: implement correct swabbing here. + */ + + hash = le64_to_cpu(ent->lde_hash); + if (hash < pos) + /* + * Skip until we find target hash + * value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (namelen == 0) + /* + * Skip dummy record. + */ + continue; + + if (api32 && hash64) + lhash = hash >> 32; + else + lhash = hash; + fid_le_to_cpu(&fid, &ent->lde_fid); + ino = cl_fid_build_ino(&fid, api32); + type = ll_dirent_type_get(ent); + /* For 'll_nfs_get_name_filldir()', it will try + * to access the 'ent' through its 'lde_name', + * so the parameter 'name' for 'filldir()' must + * be part of the 'ent'. */ + done = filldir(cookie, ent->lde_name, namelen, + lhash, ino, type); + } + next = le64_to_cpu(dp->ldp_hash_end); + if (!done) { + pos = next; + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + done = 1; + ll_release_page(page, 0); + } else if (1 /* chain is exhausted*/) { + /* + * Normal case: continue to the next + * page. + */ + ll_release_page(page, + le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + next = pos; + page = ll_get_dir_page(inode, pos, + &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + ll_release_page(page, 1); + } + } else { + pos = hash; + ll_release_page(page, 0); + } + } else { + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %lu: rc %d\n", + PFID(&info->lli_fid), (unsigned long)pos, rc); + } + } + + *_pos = pos; + ll_dir_chain_fini(&chain); + RETURN(rc); +} + +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = lfd->lfd_pos; + int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; + int api32 = ll_need_32bit_api(sbi); + int rc; + struct path path; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu " + " 32bit_api %d\n", inode->i_ino, inode->i_generation, + inode, (unsigned long)pos, i_size_read(inode), api32); + + if (pos == MDS_DIR_END_OFF) + /* + * end-of-file. + */ + GOTO(out, rc = 0); + + rc = ll_dir_read(inode, &pos, cookie, filldir); + lfd->lfd_pos = pos; + if (pos == MDS_DIR_END_OFF) { + if (api32) + filp->f_pos = LL_DIR_END_OFF_32BIT; + else + filp->f_pos = LL_DIR_END_OFF; + } else { + if (api32 && hash64) + filp->f_pos = pos >> 32; + else + filp->f_pos = pos; + } + filp->f_version = inode->i_version; + path.mnt = filp->f_path.mnt; + path.dentry = filp->f_dentry; + touch_atime(&path); + +out: + if (!rc) + ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); + + RETURN(rc); +} + +int ll_send_mgc_param(struct obd_export *mgc, char *string) +{ + struct mgs_send_param *msp; + int rc = 0; + + OBD_ALLOC_PTR(msp); + if (!msp) + return -ENOMEM; + + strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN); + rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, + sizeof(struct mgs_send_param), msp, NULL); + if (rc) + CERROR("Failed to set parameter: %d\n", rc); + OBD_FREE_PTR(msp); + + return rc; +} + +int ll_dir_setdirstripe(struct inode *dir, struct lmv_user_md *lump, + char *filename) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int mode; + int err; + + ENTRY; + + mode = (0755 & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; + op_data = ll_prep_md_op_data(NULL, dir, NULL, filename, + strlen(filename), mode, LUSTRE_OPC_MKDIR, + lump); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + op_data->op_cli_flags |= CLI_SET_MEA; + err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, + current_fsuid(), current_fsgid(), + cfs_curproc_cap_pack(), 0, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(err_exit, err); +err_exit: + ptlrpc_req_finished(request); + return err; +} + +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + int rc = 0; + struct lustre_sb_info *lsi = s2lsi(inode->i_sb); + struct obd_device *mgc = lsi->lsi_mgc; + int lum_size; + ENTRY; + + if (lump != NULL) { + /* + * This is coming from userspace, so should be in + * local endian. But the MDS would like it in little + * endian, so we swab it before we send it. + */ + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3( + (struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); + RETURN(-EINVAL); + } + } + } else { + lum_size = sizeof(struct lov_user_md_v1); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + if (lump != NULL && lump->lmm_magic == cpu_to_le32(LMV_USER_MAGIC)) + op_data->op_cli_flags |= CLI_SET_MEA; + + /* swabbing is done in lov_setstripe() on server side */ + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, + NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) { + if (rc != -EPERM && rc != -EACCES) + CERROR("mdc_setattr fails: rc = %d\n", rc); + } + + /* In the following we use the fact that LOV_USER_MAGIC_V1 and + LOV_USER_MAGIC_V3 have the same initial fields so we do not + need the make the distiction between the 2 versions */ + if (set_default && mgc->u.cli.cl_mgc_mgsexp) { + char *param = NULL; + char *buf; + + OBD_ALLOC(param, MGS_PARAM_MAXLEN); + if (param == NULL) + GOTO(end, rc = -ENOMEM); + + buf = param; + /* Get fsname and assume devname to be -MDT0000. */ + ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); + strcat(buf, "-MDT0000.lov"); + buf += strlen(buf); + + /* Set root stripesize */ + sprintf(buf, ".stripesize=%u", + lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripecount */ + sprintf(buf, ".stripecount=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + if (rc) + GOTO(end, rc); + + /* Set root stripeoffset */ + sprintf(buf, ".stripeoffset=%hd", + lump ? le16_to_cpu(lump->lmm_stripe_offset) : + (typeof(lump->lmm_stripe_offset))(-1)); + rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); + +end: + if (param != NULL) + OBD_FREE(param, MGS_PARAM_MAXLEN); + } + RETURN(rc); +} + +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + int rc, lmmsize; + struct md_op_data *op_data; + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, lmmsize, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr failed on inode " + "%lu/%u: rc %d\n", inode->i_ino, + inode->i_generation, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, + &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; + } +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +/* + * Get MDT index for the inode. + */ +int ll_get_mdt_idx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + int rc, mdtidx; + ENTRY; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_flags |= MF_GET_MDT_IDX; + rc = md_getattr(sbi->ll_md_exp, op_data, NULL); + mdtidx = op_data->op_mds; + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + RETURN(rc); + } + return mdtidx; +} + +/** + * Generic handler to do any pre-copy work. + * + * It send a first hsm_progress (with extent length == 0) to coordinator as a + * first information for it that real work has started. + * + * Moreover, for a ARCHIVE request, it will sample the file data version and + * store it in \a copy. + * + * \return 0 on success. + */ +static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + ENTRY; + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; + hpk.hpk_extent.length = 0; + hpk.hpk_flags = 0; + hpk.hpk_errval = 0; + hpk.hpk_data_version = 0; + + + /* For archive request, we need to read the current file version. */ + if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { + struct inode *inode; + __u64 data_version = 0; + + /* Get inode for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval is >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + /* Read current file data version */ + rc = ll_data_version(inode, &data_version, 1); + iput(inode); + if (rc != 0) { + CDEBUG(D_HSM, "Could not read file data version of " + DFID" (rc = %d). Archive request (" + LPX64") could not be done.\n", + PFID(©->hc_hai.hai_fid), rc, + copy->hc_hai.hai_cookie); + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + copy->hc_data_version = data_version; + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + RETURN(rc); +} + +/** + * Generic handler to do any post-copy work. + * + * It will send the last hsm_progress update to coordinator to inform it + * that copy is finished and whether it was successful or not. + * + * Moreover, + * - for ARCHIVE request, it will sample the file data version and compare it + * with the version saved in ll_ioc_copy_start(). If they do not match, copy + * will be considered as failed. + * - for RESTORE request, it will sample the file data version and send it to + * coordinator which is useful if the file was imported as 'released'. + * + * \return 0 on success. + */ +static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct hsm_progress_kernel hpk; + int rc; + ENTRY; + + /* If you modify the logic here, also check llapi_hsm_copy_end(). */ + /* Take care: copy->hc_hai.hai_action, len, gid and data are not + * initialized if copy_end was called with copy == NULL. + */ + + /* Forge a hsm_progress based on data from copy. */ + hpk.hpk_fid = copy->hc_hai.hai_fid; + hpk.hpk_cookie = copy->hc_hai.hai_cookie; + hpk.hpk_extent = copy->hc_hai.hai_extent; + hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; + hpk.hpk_errval = copy->hc_errval; + hpk.hpk_data_version = 0; + + /* For archive request, we need to check the file data was not changed. + * + * For restore request, we need to send the file data version, this is + * useful when the file was created using hsm_import. + */ + if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || + (copy->hc_hai.hai_action == HSMA_RESTORE)) && + (copy->hc_errval == 0)) { + struct inode *inode; + __u64 data_version = 0; + + /* Get lsm for this fid */ + inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); + if (IS_ERR(inode)) { + hpk.hpk_flags |= HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = -PTR_ERR(inode); + GOTO(progress, rc = PTR_ERR(inode)); + } + + rc = ll_data_version(inode, &data_version, + copy->hc_hai.hai_action == HSMA_ARCHIVE); + iput(inode); + if (rc) { + CDEBUG(D_HSM, "Could not read file data version. " + "Request could not be confirmed.\n"); + if (hpk.hpk_errval == 0) + hpk.hpk_errval = -rc; + GOTO(progress, rc); + } + + /* Store it the hsm_copy for later copytool use. + * Always modified even if no lsm. */ + hpk.hpk_data_version = data_version; + + /* File could have been stripped during archiving, so we need + * to check anyway. */ + if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && + (copy->hc_data_version != data_version)) { + CDEBUG(D_HSM, "File data version mismatched. " + "File content was changed during archiving. " + DFID", start:"LPX64" current:"LPX64"\n", + PFID(©->hc_hai.hai_fid), + copy->hc_data_version, data_version); + /* File was changed, send error to cdt. Do not ask for + * retry because if a file is modified frequently, + * the cdt will loop on retried archive requests. + * The policy engine will ask for a new archive later + * when the file will not be modified for some tunable + * time */ + /* we do not notify caller */ + hpk.hpk_flags &= ~HP_FLAG_RETRY; + /* hpk_errval must be >= 0 */ + hpk.hpk_errval = EBUSY; + } + + } + +progress: + rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), + &hpk, NULL); + + RETURN(rc); +} + + +static int copy_and_ioctl(int cmd, struct obd_export *exp, void *data, int len) +{ + void *ptr; + int rc; + + OBD_ALLOC(ptr, len); + if (ptr == NULL) + return -ENOMEM; + if (copy_from_user(ptr, data, len)) { + OBD_FREE(ptr, len); + return -EFAULT; + } + rc = obd_iocontrol(cmd, exp, len, data, NULL); + OBD_FREE(ptr, len); + return rc; +} + +static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) +{ + int cmd = qctl->qc_cmd; + int type = qctl->qc_type; + int id = qctl->qc_id; + int valid = qctl->qc_valid; + int rc = 0; + ENTRY; + + switch (cmd) { + case LUSTRE_Q_INVALIDATE: + case LUSTRE_Q_FINVALIDATE: + case Q_QUOTAON: + case Q_QUOTAOFF: + case Q_SETQUOTA: + case Q_SETINFO: + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + RETURN(-EPERM); + break; + case Q_GETQUOTA: + if (((type == USRQUOTA && current_euid() != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && + (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT)) + RETURN(-EPERM); + break; + case Q_GETINFO: + break; + default: + CERROR("unsupported quotactl op: %#x\n", cmd); + RETURN(-ENOTTY); + } + + if (valid != QC_GENERAL) { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + RETURN(-EOPNOTSUPP); + + if (cmd == Q_GETINFO) + qctl->qc_cmd = Q_GETOINFO; + else if (cmd == Q_GETQUOTA) + qctl->qc_cmd = Q_GETOQUOTA; + else + RETURN(-EINVAL); + + switch (valid) { + case QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + default: + rc = -EINVAL; + break; + } + + if (rc) + RETURN(rc); + + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (oqctl == NULL) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + if (rc != -EALREADY && cmd == Q_QUOTAON) { + oqctl->qc_cmd = Q_QUOTAOFF; + obd_quotactl(sbi->ll_md_exp, oqctl); + } + OBD_FREE_PTR(oqctl); + RETURN(rc); + } + /* If QIF_SPACE is not set, client should collect the + * space usage from OSSs by itself */ + if (cmd == Q_GETQUOTA && + !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && + !oqctl->qc_dqblk.dqb_curspace) { + struct obd_quotactl *oqctl_tmp; + + OBD_ALLOC_PTR(oqctl_tmp); + if (oqctl_tmp == NULL) + GOTO(out, rc = -ENOMEM); + + oqctl_tmp->qc_cmd = Q_GETOQUOTA; + oqctl_tmp->qc_id = oqctl->qc_id; + oqctl_tmp->qc_type = oqctl->qc_type; + + /* collect space usage from OSTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace = + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } + + /* collect space & inode usage from MDTs */ + oqctl_tmp->qc_dqblk.dqb_curspace = 0; + oqctl_tmp->qc_dqblk.dqb_curinodes = 0; + rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); + if (!rc || rc == -EREMOTEIO) { + oqctl->qc_dqblk.dqb_curspace += + oqctl_tmp->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = + oqctl_tmp->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } else { + oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; + } + + OBD_FREE_PTR(oqctl_tmp); + } +out: + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } + + RETURN(rc); +} + +static char * +ll_getname(const char __user *filename) +{ + int ret = 0, len; + char *tmp = __getname(); + + if (!tmp) + return ERR_PTR(-ENOMEM); + + len = strncpy_from_user(tmp, filename, PATH_MAX); + if (len == 0) + ret = -ENOENT; + else if (len > PATH_MAX) + ret = -ENAMETOOLONG; + + if (ret) { + __putname(tmp); + tmp = ERR_PTR(ret); + } + return tmp; +} + +#define ll_putname(filename) __putname(filename) + +static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), cmd=%#x\n", + inode->i_ino, inode->i_generation, inode, cmd); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + return -ENOTTY; + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + switch(cmd) { + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int *)arg)); + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. + case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int*)arg)) + RETURN(-EFAULT); + + return 0; + } + case IOC_MDC_LOOKUP: { + struct ptlrpc_request *request = NULL; + int namelen, len = 0; + char *buf = NULL; + char *filename; + struct md_op_data *op_data; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + RETURN(rc); + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = strlen(filename); + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(out_free, rc = -EINVAL); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, namelen, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + GOTO(out_free, rc = PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLID; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name: %d\n", rc); + GOTO(out_free, rc); + } + ptlrpc_req_finished(request); + EXIT; +out_free: + obd_ioctl_freedata(buf, len); + return rc; + } + case LL_IOC_LMV_SETSTRIPE: { + struct lmv_user_md *lum; + char *buf = NULL; + char *filename; + int namelen = 0; + int lumlen = 0; + int len; + int rc; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + RETURN(rc); + + data = (void *)buf; + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL || + data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) + GOTO(lmv_out_free, rc = -EINVAL); + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); + GOTO(lmv_out_free, rc = -EINVAL); + } + lum = (struct lmv_user_md *)data->ioc_inlbuf2; + lumlen = data->ioc_inllen2; + + if (lum->lum_magic != LMV_USER_MAGIC || + lumlen != sizeof(*lum)) { + CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", + filename, lum->lum_magic, lumlen, -EFAULT); + GOTO(lmv_out_free, rc = -EINVAL); + } + + /** + * ll_dir_setdirstripe will be used to set dir stripe + * mdc_create--->mdt_reint_create (with dirstripe) + */ + rc = ll_dir_setdirstripe(inode, lum, filename); +lmv_out_free: + obd_ioctl_freedata(buf, len); + RETURN(rc); + + } + case LL_IOC_LOV_SETSTRIPE: { + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + + int set_default = 0; + + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); + /* first try with v1 which is smaller than v3 */ + if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) + RETURN(-EFAULT); + + if ((lumv1->lmm_magic == LOV_USER_MAGIC_V3) ) { + if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) + RETURN(-EFAULT); + } + + if (inode->i_sb->s_root == file->f_dentry) + set_default = 1; + + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1, set_default); + + RETURN(rc); + } + case LL_IOC_LMV_GETSTRIPE: { + struct lmv_user_md *lump = (struct lmv_user_md *)arg; + struct lmv_user_md lum; + struct lmv_user_md *tmp; + int lum_size; + int rc = 0; + int mdtindex; + + if (copy_from_user(&lum, lump, sizeof(struct lmv_user_md))) + RETURN(-EFAULT); + + if (lum.lum_magic != LMV_MAGIC_V1) + RETURN(-EINVAL); + + lum_size = lmv_user_md_size(1, LMV_MAGIC_V1); + OBD_ALLOC(tmp, lum_size); + if (tmp == NULL) + GOTO(free_lmv, rc = -ENOMEM); + + memcpy(tmp, &lum, sizeof(lum)); + tmp->lum_type = LMV_STRIPE_TYPE; + tmp->lum_stripe_count = 1; + mdtindex = ll_get_mdt_idx(inode); + if (mdtindex < 0) + GOTO(free_lmv, rc = -ENOMEM); + + tmp->lum_stripe_offset = mdtindex; + tmp->lum_objects[0].lum_mds = mdtindex; + memcpy(&tmp->lum_objects[0].lum_fid, ll_inode2fid(inode), + sizeof(struct lu_fid)); + if (copy_to_user((void *)arg, tmp, lum_size)) + GOTO(free_lmv, rc = -EFAULT); +free_lmv: + if (tmp) + OBD_FREE(tmp, lum_size); + RETURN(rc); + } + case LL_IOC_REMOVE_ENTRY: { + char *filename = NULL; + int namelen = 0; + int rc; + + /* Here is a little hack to avoid sending REINT_RMENTRY to + * unsupported server, which might crash the server(LU-2730), + * Because both LVB_TYPE and REINT_RMENTRY will be supported + * on 2.4, we use OBD_CONNECT_LVB_TYPE to detect whether the + * server will support REINT_RMENTRY XXX*/ + if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_LVB_TYPE)) + return -ENOTSUPP; + + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + namelen = strlen(filename); + if (namelen < 1) + GOTO(out_rmdir, rc = -EINVAL); + + rc = ll_rmdir_entry(inode, filename, namelen); +out_rmdir: + if (filename) + ll_putname(filename); + RETURN(rc); + } + case LL_IOC_LOV_SWAP_LAYOUTS: + RETURN(-EPERM); + case LL_IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void *)arg)); + case LL_IOC_LOV_GETSTRIPE: + case LL_IOC_MDC_GETINFO: + case IOC_MDC_GETFILEINFO: + case IOC_MDC_GETFILESTRIPE: { + struct ptlrpc_request *request = NULL; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct mdt_body *body; + char *filename = NULL; + int lmmsize; + + if (cmd == IOC_MDC_GETFILEINFO || + cmd == IOC_MDC_GETFILESTRIPE) { + filename = ll_getname((const char *)arg); + if (IS_ERR(filename)) + RETURN(PTR_ERR(filename)); + + rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, + &lmmsize, &request); + } else { + rc = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (request) { + body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + } else { + GOTO(out_req, rc); + } + + if (rc < 0) { + if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || + cmd == LL_IOC_MDC_GETINFO)) + GOTO(skip_lmm, rc = 0); + else + GOTO(out_req, rc); + } + + if (cmd == IOC_MDC_GETFILESTRIPE || + cmd == LL_IOC_LOV_GETSTRIPE) { + lump = (struct lov_user_md *)arg; + } else { + struct lov_user_mds_data *lmdp; + lmdp = (struct lov_user_mds_data *)arg; + lump = &lmdp->lmd_lmm; + } + if (copy_to_user(lump, lmm, lmmsize)) { + if (copy_to_user(lump, lmm, sizeof(*lump))) + GOTO(out_req, rc = -EFAULT); + rc = -EOVERFLOW; + } + skip_lmm: + if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { + struct lov_user_mds_data *lmdp; + lstat_t st = { 0 }; + + st.st_dev = inode->i_sb->s_dev; + st.st_mode = body->mode; + st.st_nlink = body->nlink; + st.st_uid = body->uid; + st.st_gid = body->gid; + st.st_rdev = body->rdev; + st.st_size = body->size; + st.st_blksize = PAGE_CACHE_SIZE; + st.st_blocks = body->blocks; + st.st_atime = body->atime; + st.st_mtime = body->mtime; + st.st_ctime = body->ctime; + st.st_ino = inode->i_ino; + + lmdp = (struct lov_user_mds_data *)arg; + if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) + GOTO(out_req, rc = -EFAULT); + } + + EXIT; + out_req: + ptlrpc_req_finished(request); + if (filename) + ll_putname(filename); + return rc; + } + case IOC_LOV_GETINFO: { + struct lov_user_mds_data *lumd; + struct lov_stripe_md *lsm; + struct lov_user_md *lum; + struct lov_mds_md *lmm; + int lmmsize; + lstat_t st; + + lumd = (struct lov_user_mds_data *)arg; + lum = &lumd->lmd_lmm; + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + OBD_ALLOC_LARGE(lmm, lmmsize); + if (copy_from_user(lmm, lum, lmmsize)) + GOTO(free_lmm, rc = -EFAULT); + + switch (lmm->lmm_magic) { + case LOV_USER_MAGIC_V1: + if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_USER_MAGIC_V3: + if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + GOTO(free_lmm, rc = -EINVAL); + } + + rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize); + if (rc < 0) + GOTO(free_lmm, rc = -ENOMEM); + + /* Perform glimpse_size operation. */ + memset(&st, 0, sizeof(st)); + + rc = ll_glimpse_ioctl(sbi, lsm, &st); + if (rc) + GOTO(free_lsm, rc); + + if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) + GOTO(free_lsm, rc = -EFAULT); + + EXIT; + free_lsm: + obd_free_memmd(sbi->ll_dt_exp, &lsm); + free_lmm: + OBD_FREE_LARGE(lmm, lmmsize); + return rc; + } + case OBD_IOC_LLOG_CATINFO: { + RETURN(-EOPNOTSUPP); + } + case OBD_IOC_QUOTACHECK: { + struct obd_quotactl *oqctl; + int error = 0; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + RETURN(-EPERM); + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + oqctl->qc_type = arg; + rc = obd_quotacheck(sbi->ll_md_exp, oqctl); + if (rc < 0) { + CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc); + error = rc; + } + + rc = obd_quotacheck(sbi->ll_dt_exp, oqctl); + if (rc < 0) + CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc); + + OBD_FREE_PTR(oqctl); + return error ?: rc; + } + case OBD_IOC_POLL_QUOTACHECK: { + struct if_quotacheck *check; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) + RETURN(-EPERM); + + OBD_ALLOC_PTR(check); + if (!check) + RETURN(-ENOMEM); + + rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + GOTO(out_poll, rc); + } + + rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check, + NULL); + if (rc) { + CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc); + if (copy_to_user((void *)arg, check, + sizeof(*check))) + CDEBUG(D_QUOTA, "copy_to_user failed\n"); + GOTO(out_poll, rc); + } + out_poll: + OBD_FREE_PTR(check); + RETURN(rc); + } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) + case LL_IOC_QUOTACTL_18: { + /* copy the old 1.x quota struct for internal use, then copy + * back into old format struct. For 1.8 compatibility. */ + struct if_quotactl_18 *qctl_18; + struct if_quotactl *qctl_20; + + OBD_ALLOC_PTR(qctl_18); + if (!qctl_18) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(qctl_20); + if (!qctl_20) + GOTO(out_quotactl_18, rc = -ENOMEM); + + if (copy_from_user(qctl_18, (void *)arg, sizeof(*qctl_18))) + GOTO(out_quotactl_20, rc = -ENOMEM); + + QCTL_COPY(qctl_20, qctl_18); + qctl_20->qc_idx = 0; + + /* XXX: dqb_valid was borrowed as a flag to mark that + * only mds quota is wanted */ + if (qctl_18->qc_cmd == Q_GETQUOTA && + qctl_18->qc_dqblk.dqb_valid) { + qctl_20->qc_valid = QC_MDTIDX; + qctl_20->qc_dqblk.dqb_valid = 0; + } else if (qctl_18->obd_uuid.uuid[0] != '\0') { + qctl_20->qc_valid = QC_UUID; + qctl_20->obd_uuid = qctl_18->obd_uuid; + } else { + qctl_20->qc_valid = QC_GENERAL; + } + + rc = quotactl_ioctl(sbi, qctl_20); + + if (rc == 0) { + QCTL_COPY(qctl_18, qctl_20); + qctl_18->obd_uuid = qctl_20->obd_uuid; + + if (copy_to_user((void *)arg, qctl_18, + sizeof(*qctl_18))) + rc = -EFAULT; + } + + out_quotactl_20: + OBD_FREE_PTR(qctl_20); + out_quotactl_18: + OBD_FREE_PTR(qctl_18); + RETURN(rc); + } +#else +#warning "remove old LL_IOC_QUOTACTL_18 compatibility code" +#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */ + case LL_IOC_QUOTACTL: { + struct if_quotactl *qctl; + + OBD_ALLOC_PTR(qctl); + if (!qctl) + RETURN(-ENOMEM); + + if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) + GOTO(out_quotactl, rc = -EFAULT); + + rc = quotactl_ioctl(sbi, qctl); + + if (rc == 0 && copy_to_user((void *)arg,qctl,sizeof(*qctl))) + rc = -EFAULT; + + out_quotactl: + OBD_FREE_PTR(qctl); + RETURN(rc); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); +#ifdef CONFIG_FS_POSIX_ACL + case LL_IOC_RMTACL: { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + inode == inode->i_sb->s_root->d_inode) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + rc = rct_add(&sbi->ll_rct, current_pid(), arg); + if (!rc) + fd->fd_flags |= LL_FILE_RMTACL; + RETURN(rc); + } else + RETURN(0); + } +#endif + case LL_IOC_GETOBDCOUNT: { + int count, vallen; + struct obd_export *exp; + + if (copy_from_user(&count, (int *)arg, sizeof(int))) + RETURN(-EFAULT); + + /* get ost count when count is zero, get mdt count otherwise */ + exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp; + vallen = sizeof(count); + rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), + KEY_TGT_COUNT, &vallen, &count, NULL); + if (rc) { + CERROR("get target count failed: %d\n", rc); + RETURN(rc); + } + + if (copy_to_user((int *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } + case LL_IOC_PATH2FID: + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + RETURN(0); + case LL_IOC_GET_CONNECT_FLAGS: { + RETURN(obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, (void*)arg)); + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct ioc_changelog)); + RETURN(rc); + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void *)arg)); + case LL_IOC_HSM_REQUEST: { + struct hsm_user_request *hur; + int totalsize; + + OBD_ALLOC_PTR(hur); + if (hur == NULL) + RETURN(-ENOMEM); + + /* We don't know the true size yet; copy the fixed-size part */ + if (copy_from_user(hur, (void *)arg, sizeof(*hur))) { + OBD_FREE_PTR(hur); + RETURN(-EFAULT); + } + + /* Compute the whole struct size */ + totalsize = hur_len(hur); + OBD_FREE_PTR(hur); + OBD_ALLOC_LARGE(hur, totalsize); + if (hur == NULL) + RETURN(-ENOMEM); + + /* Copy the whole struct */ + if (copy_from_user(hur, (void *)arg, totalsize)) { + OBD_FREE_LARGE(hur, totalsize); + RETURN(-EFAULT); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, + hur, NULL); + + OBD_FREE_LARGE(hur, totalsize); + + RETURN(rc); + } + case LL_IOC_HSM_PROGRESS: { + struct hsm_progress_kernel hpk; + struct hsm_progress hp; + + if (copy_from_user(&hp, (void *)arg, sizeof(hp))) + RETURN(-EFAULT); + + hpk.hpk_fid = hp.hp_fid; + hpk.hpk_cookie = hp.hp_cookie; + hpk.hpk_extent = hp.hp_extent; + hpk.hpk_flags = hp.hp_flags; + hpk.hpk_errval = hp.hp_errval; + hpk.hpk_data_version = 0; + + /* File may not exist in Lustre; all progress + * reported to Lustre root */ + rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, + NULL); + RETURN(rc); + } + case LL_IOC_HSM_CT_START: + rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void *)arg, + sizeof(struct lustre_kernelcomm)); + RETURN(rc); + + case LL_IOC_HSM_COPY_START: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_start(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + case LL_IOC_HSM_COPY_END: { + struct hsm_copy *copy; + int rc; + + OBD_ALLOC_PTR(copy); + if (copy == NULL) + RETURN(-ENOMEM); + if (copy_from_user(copy, (char *)arg, sizeof(*copy))) { + OBD_FREE_PTR(copy); + RETURN(-EFAULT); + } + + rc = ll_ioc_copy_end(inode->i_sb, copy); + if (copy_to_user((char *)arg, copy, sizeof(*copy))) + rc = -EFAULT; + + OBD_FREE_PTR(copy); + RETURN(rc); + } + default: + RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, + (void *)arg)); + } +} + +static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int api32 = ll_need_32bit_api(sbi); + loff_t ret = -EINVAL; + ENTRY; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_END: + if (offset > 0) + GOTO(out, ret); + if (api32) + offset += LL_DIR_END_OFF_32BIT; + else + offset += LL_DIR_END_OFF; + break; + default: + GOTO(out, ret); + } + + if (offset >= 0 && + ((api32 && offset <= LL_DIR_END_OFF_32BIT) || + (!api32 && offset <= LL_DIR_END_OFF))) { + if (offset != file->f_pos) { + if ((api32 && offset == LL_DIR_END_OFF_32BIT) || + (!api32 && offset == LL_DIR_END_OFF)) + fd->lfd_pos = MDS_DIR_END_OFF; + else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) + fd->lfd_pos = offset << 32; + else + fd->lfd_pos = offset; + file->f_pos = offset; + file->f_version = 0; + } + ret = offset; + } + GOTO(out, ret); + +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +int ll_dir_open(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_open(inode, file)); +} + +int ll_dir_release(struct inode *inode, struct file *file) +{ + ENTRY; + RETURN(ll_file_release(inode, file)); +} + +struct file_operations ll_dir_operations = { + .llseek = ll_dir_seek, + .open = ll_dir_open, + .release = ll_dir_release, + .read = generic_read_dir, + .readdir = ll_readdir, + .unlocked_ioctl = ll_dir_ioctl, + .fsync = ll_fsync, +}; diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c new file mode 100644 index 000000000000..d423de1eb5da --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -0,0 +1,3196 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/file.c + * + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include +#include +#include +#include "llite_internal.h" +#include + +#include "cl_object.h" + +struct ll_file_data *ll_file_data_get(void) +{ + struct ll_file_data *fd; + + OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO); + fd->fd_write_failed = false; + return fd; +} + +static void ll_file_data_put(struct ll_file_data *fd) +{ + if (fd != NULL) + OBD_SLAB_FREE_PTR(fd, ll_file_data_slab); +} + +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh) +{ + op_data->op_fid1 = ll_i2info(inode)->lli_fid; + op_data->op_attr.ia_mode = inode->i_mode; + op_data->op_attr.ia_atime = inode->i_atime; + op_data->op_attr.ia_mtime = inode->i_mtime; + op_data->op_attr.ia_ctime = inode->i_ctime; + op_data->op_attr.ia_size = i_size_read(inode); + op_data->op_attr_blocks = inode->i_blocks; + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + ll_inode_to_ext_flags(inode->i_flags); + op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch; + if (fh) + op_data->op_handle = *fh; + op_data->op_capa1 = ll_mdscapa_get(inode); + + if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; +} + +/** + * Closes the IO epoch and packs all the attributes into @op_data for + * the CLOSE rpc. + */ +static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle *och) +{ + ENTRY; + + op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET | + ATTR_MTIME_SET | ATTR_CTIME_SET; + + if (!(och->och_flags & FMODE_WRITE)) + goto out; + + if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode)) + op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; + else + ll_ioepoch_close(inode, op_data, &och, 0); + +out: + ll_pack_inode2opdata(inode, op_data, &och->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + EXIT; +} + +static int ll_close_inode_openhandle(struct obd_export *md_exp, + struct inode *inode, + struct obd_client_handle *och) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct md_op_data *op_data; + struct ptlrpc_request *req = NULL; + struct obd_device *obd = class_exp2obd(exp); + int epoch_close = 1; + int rc; + ENTRY; + + if (obd == NULL) { + /* + * XXX: in case of LMV, is this correct to access + * ->exp_handle? + */ + CERROR("Invalid MDC connection handle "LPX64"\n", + ll_i2mdexp(inode)->exp_handle.h_cookie); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here. + + ll_prepare_close(inode, op_data, och); + epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE); + rc = md_close(md_exp, op_data, och->och_mod, &req); + if (rc == -EAGAIN) { + /* This close must have the epoch closed. */ + LASSERT(epoch_close); + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + if (rc) { + CERROR("inode %lu mdc Size-on-MDS update failed: " + "rc = %d\n", inode->i_ino, rc); + rc = 0; + } + } else if (rc) { + CERROR("inode %lu mdc close failed: rc = %d\n", + inode->i_ino, rc); + } + + /* DATA_MODIFIED flag was successfully sent on close, cancel data + * modification flag. */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + ll_finish_md_op_data(op_data); + + if (rc == 0) { + rc = ll_objects_destroy(req, inode); + if (rc) + CERROR("inode %lu ll_objects destroy: rc = %d\n", + inode->i_ino, rc); + } + + EXIT; +out: + + if (exp_connect_som(exp) && !epoch_close && + S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) { + ll_queue_done_writing(inode, LLIF_DONE_WRITING); + } else { + md_clear_open_replay_data(md_exp, och); + /* Free @och if it is not waiting for DONE_WRITING. */ + och->och_fh.cookie = DEAD_HANDLE_MAGIC; + OBD_FREE_PTR(och); + } + if (req) /* This is close request */ + ptlrpc_req_finished(req); + return rc; +} + +int ll_md_real_close(struct inode *inode, int flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_client_handle **och_p; + struct obd_client_handle *och; + __u64 *och_usecount; + int rc = 0; + ENTRY; + + if (flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + LASSERT(flags & FMODE_READ); + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_usecount) { /* There are still users of this handle, so + skip freeing it. */ + mutex_unlock(&lli->lli_och_mutex); + RETURN(0); + } + och=*och_p; + *och_p = NULL; + mutex_unlock(&lli->lli_och_mutex); + + if (och) { /* There might be a race and somebody have freed this och + already */ + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och); + } + + RETURN(rc); +} + +int ll_md_close(struct obd_export *md_exp, struct inode *inode, + struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + ENTRY; + + /* clear group lock, if present */ + if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid); + + /* Let's see if we have good enough OPEN lock on the file and if + we can skip talking to MDS */ + if (file->f_dentry->d_inode) { /* Can this ever be false? */ + int lockmode; + int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; + struct lustre_handle lockh; + struct inode *inode = file->f_dentry->d_inode; + ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}}; + + mutex_lock(&lli->lli_och_mutex); + if (fd->fd_omode & FMODE_WRITE) { + lockmode = LCK_CW; + LASSERT(lli->lli_open_fd_write_count); + lli->lli_open_fd_write_count--; + } else if (fd->fd_omode & FMODE_EXEC) { + lockmode = LCK_PR; + LASSERT(lli->lli_open_fd_exec_count); + lli->lli_open_fd_exec_count--; + } else { + lockmode = LCK_CR; + LASSERT(lli->lli_open_fd_read_count); + lli->lli_open_fd_read_count--; + } + mutex_unlock(&lli->lli_och_mutex); + + if (!md_lock_match(md_exp, flags, ll_inode2fid(inode), + LDLM_IBITS, &policy, lockmode, + &lockh)) { + rc = ll_md_real_close(file->f_dentry->d_inode, + fd->fd_omode); + } + } else { + CERROR("Releasing a file %p with negative dentry %p. Name %s", + file, file->f_dentry, file->f_dentry->d_name.name); + } + + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + ll_capa_close(inode); + + RETURN(rc); +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ +int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + inode == inode->i_sb->s_root->d_inode) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) { + fd->fd_flags &= ~LL_FILE_RMTACL; + rct_del(&sbi->ll_rct, current_pid()); + et_search_free(&sbi->ll_et, current_pid()); + } + } +#endif + + if (inode->i_sb->s_root != file->f_dentry) + ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); + fd = LUSTRE_FPRIVATE(file); + LASSERT(fd != NULL); + + /* The last ref on @file, maybe not the the owner pid of statahead. + * Different processes can open the same dir, "ll_opendir_key" means: + * it is me that should stop the statahead thread. */ + if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd && + lli->lli_opendir_pid != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + + if (inode->i_sb->s_root == file->f_dentry) { + LUSTRE_FPRIVATE(file) = NULL; + ll_file_data_put(fd); + RETURN(0); + } + + if (!S_ISDIR(inode->i_mode)) { + lov_read_and_clear_async_rc(lli->lli_clob); + lli->lli_async_rc = 0; + } + + rc = ll_md_close(sbi->ll_md_exp, inode, file); + + if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) + libcfs_debug_dumplog(); + + RETURN(rc); +} + +static int ll_intent_file_open(struct file *file, void *lmm, + int lmmsize, struct lookup_intent *itp) +{ + struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode); + struct dentry *parent = file->f_dentry->d_parent; + const char *name = file->f_dentry->d_name.name; + const int len = file->f_dentry->d_name.len; + struct md_op_data *op_data; + struct ptlrpc_request *req; + __u32 opc = LUSTRE_OPC_ANY; + int rc; + ENTRY; + + if (!parent) + RETURN(-ENOENT); + + /* Usually we come here only for NFSD, and we want open lock. + But we can also get here with pre 2.6.15 patchless kernels, and in + that case that lock is also ok */ + /* We can also get here if there was cached open handle in revalidate_it + * but it disappeared while we were getting from there to ll_file_open. + * But this means this file was closed and immediatelly opened which + * makes a good candidate for using OPEN lock */ + /* If lmmsize & lmm are not 0, we are just setting stripe info + * parameters. No need for the open lock */ + if (lmm == NULL && lmmsize == 0) { + itp->it_flags |= MDS_OPEN_LOCK; + if (itp->it_flags & FMODE_WRITE) + opc = LUSTRE_OPC_CREATE; + } + + op_data = ll_prep_md_op_data(NULL, parent->d_inode, + file->f_dentry->d_inode, name, len, + O_RDWR, opc, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + itp->it_flags |= MDS_OPEN_BY_FID; + rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp, + 0 /*unused */, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc == -ESTALE) { + /* reason for keep own exit path - don`t flood log + * with messages with -ESTALE errors. + */ + if (!it_disposition(itp, DISP_OPEN_OPEN) || + it_open_error(DISP_OPEN_OPEN, itp)) + GOTO(out, rc); + ll_release_openhandle(file->f_dentry, itp); + GOTO(out, rc); + } + + if (it_disposition(itp, DISP_LOOKUP_NEG)) + GOTO(out, rc = -ENOENT); + + if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { + rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); + CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); + GOTO(out, rc); + } + + rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp); + if (!rc && itp->d.lustre.it_lock_mode) + ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode, + itp, NULL); + +out: + ptlrpc_req_finished(itp->d.lustre.it_data); + it_clear_disposition(itp, DISP_ENQ_COMPLETE); + ll_intent_drop_lock(itp); + + RETURN(rc); +} + +/** + * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does + * not believe attributes if a few ioepoch holders exist. Attributes for + * previous ioepoch if new one is opened are also skipped by MDS. + */ +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch) +{ + if (ioepoch && lli->lli_ioepoch != ioepoch) { + lli->lli_ioepoch = ioepoch; + CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n", + ioepoch, PFID(&lli->lli_fid)); + } +} + +static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli, + struct lookup_intent *it, struct obd_client_handle *och) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + + LASSERT(och); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* reply already checked out */ + + memcpy(&och->och_fh, &body->handle, sizeof(body->handle)); + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + och->och_fid = lli->lli_fid; + och->och_flags = it->it_flags; + ll_ioepoch_open(lli, body->ioepoch); + + return md_set_open_replay_data(md_exp, och, req); +} + +int ll_local_open(struct file *file, struct lookup_intent *it, + struct ll_file_data *fd, struct obd_client_handle *och) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + LASSERT(!LUSTRE_FPRIVATE(file)); + + LASSERT(fd != NULL); + + if (och) { + struct ptlrpc_request *req = it->d.lustre.it_data; + struct mdt_body *body; + int rc; + + rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och); + if (rc) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if ((it->it_flags & FMODE_WRITE) && + (body->valid & OBD_MD_FLSIZE)) + CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n", + lli->lli_ioepoch, PFID(&lli->lli_fid)); + } + + LUSTRE_FPRIVATE(file) = fd; + ll_readahead_init(inode, &fd->fd_ras); + fd->fd_omode = it->it_flags; + RETURN(0); +} + +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. + * + * If we already have the stripe MD locally then we don't request it in + * md_open(), by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. + */ +int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; + struct obd_client_handle **och_p = NULL; + __u64 *och_usecount = NULL; + struct ll_file_data *fd; + int rc = 0, opendir_set = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino, + inode->i_generation, inode, file->f_flags); + + it = file->private_data; /* XXX: compat macro */ + file->private_data = NULL; /* prevent ll_local_open assertion */ + + fd = ll_file_data_get(); + if (fd == NULL) + GOTO(out_och_free, rc = -ENOMEM); + + fd->fd_file = file; + if (S_ISDIR(inode->i_mode)) { + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL && + lli->lli_opendir_pid == 0) { + lli->lli_opendir_key = fd; + lli->lli_opendir_pid = current_pid(); + opendir_set = 1; + } + spin_unlock(&lli->lli_sa_lock); + } + + if (inode->i_sb->s_root == file->f_dentry) { + LUSTRE_FPRIVATE(file) = fd; + RETURN(0); + } + + if (!it || !it->d.lustre.it_disposition) { + /* Convert f_flags into access mode. We cannot use file->f_mode, + * because everything but O_ACCMODE mask was stripped from + * there */ + if ((oit.it_flags + 1) & O_ACCMODE) + oit.it_flags++; + if (file->f_flags & O_TRUNC) + oit.it_flags |= FMODE_WRITE; + + /* kernel only call f_op->open in dentry_open. filp_open calls + * dentry_open after call to open_namei that checks permissions. + * Only nfsd_open call dentry_open directly without checking + * permissions and because of that this code below is safe. */ + if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) + oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; + + /* We do not want O_EXCL here, presumably we opened the file + * already? XXX - NFS implications? */ + oit.it_flags &= ~O_EXCL; + + /* bug20584, if "it_flags" contains O_CREAT, the file will be + * created if necessary, then "IT_CREAT" should be set to keep + * consistent with it */ + if (oit.it_flags & O_CREAT) + oit.it_op |= IT_CREAT; + + it = &oit; + } + +restart: + /* Let's see if we have file open on MDS already. */ + if (it->it_flags & FMODE_WRITE) { + och_p = &lli->lli_mds_write_och; + och_usecount = &lli->lli_open_fd_write_count; + } else if (it->it_flags & FMODE_EXEC) { + och_p = &lli->lli_mds_exec_och; + och_usecount = &lli->lli_open_fd_exec_count; + } else { + och_p = &lli->lli_mds_read_och; + och_usecount = &lli->lli_open_fd_read_count; + } + + mutex_lock(&lli->lli_och_mutex); + if (*och_p) { /* Open handle is present */ + if (it_disposition(it, DISP_OPEN_OPEN)) { + /* Well, there's extra open request that we do not need, + let's close it somehow. This will decref request. */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) { + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + + ll_release_openhandle(file->f_dentry, it); + } + (*och_usecount)++; + + rc = ll_local_open(file, it, fd, NULL); + if (rc) { + (*och_usecount)--; + mutex_unlock(&lli->lli_och_mutex); + GOTO(out_openerr, rc); + } + } else { + LASSERT(*och_usecount == 0); + if (!it->d.lustre.it_disposition) { + /* We cannot just request lock handle now, new ELC code + means that one of other OPEN locks for this file + could be cancelled, and since blocking ast handler + would attempt to grab och_mutex as well, that would + result in a deadlock */ + mutex_unlock(&lli->lli_och_mutex); + it->it_create_mode |= M_CHECK_STALE; + rc = ll_intent_file_open(file, NULL, 0, it); + it->it_create_mode &= ~M_CHECK_STALE; + if (rc) + GOTO(out_openerr, rc); + + goto restart; + } + OBD_ALLOC(*och_p, sizeof (struct obd_client_handle)); + if (!*och_p) + GOTO(out_och_free, rc = -ENOMEM); + + (*och_usecount)++; + + /* md_intent_lock() didn't get a request ref if there was an + * open error, so don't do cleanup on the request here + * (bug 3430) */ + /* XXX (green): Should not we bail out on any error here, not + * just open error? */ + rc = it_open_error(DISP_OPEN_OPEN, it); + if (rc) + GOTO(out_och_free, rc); + + LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF)); + + rc = ll_local_open(file, it, fd, *och_p); + if (rc) + GOTO(out_och_free, rc); + } + mutex_unlock(&lli->lli_och_mutex); + fd = NULL; + + /* Must do this outside lli_och_mutex lock to prevent deadlock where + different kind of OPEN lock for this same inode gets cancelled + by ldlm_cancel_lru */ + if (!S_ISREG(inode->i_mode)) + GOTO(out_och_free, rc); + + ll_capa_open(inode); + + if (!lli->lli_has_smd) { + if (file->f_flags & O_LOV_DELAY_CREATE || + !(file->f_mode & FMODE_WRITE)) { + CDEBUG(D_INODE, "object creation was delayed\n"); + GOTO(out_och_free, rc); + } + } + file->f_flags &= ~O_LOV_DELAY_CREATE; + GOTO(out_och_free, rc); + +out_och_free: + if (rc) { + if (och_p && *och_p) { + OBD_FREE(*och_p, sizeof (struct obd_client_handle)); + *och_p = NULL; /* OBD_FREE writes some magic there */ + (*och_usecount)--; + } + mutex_unlock(&lli->lli_och_mutex); + +out_openerr: + if (opendir_set != 0) + ll_stop_statahead(inode, lli->lli_opendir_key); + if (fd != NULL) + ll_file_data_put(fd); + } else { + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); + } + + if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + + return rc; +} + +/* Fills the obdo with the attributes for the lsm */ +static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp, + struct obd_capa *capa, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct ptlrpc_request_set *set; + struct obd_info oinfo = { { { 0 } } }; + int rc; + + ENTRY; + + LASSERT(lsm != NULL); + + oinfo.oi_md = lsm; + oinfo.oi_oa = obdo; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_mode = S_IFREG; + oinfo.oi_oa->o_ioepoch = ioepoch; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | + OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLBLKSZ | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLGROUP | OBD_MD_FLEPOCH | + OBD_MD_FLDATAVERSION; + oinfo.oi_capa = capa; + if (sync) { + oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS; + oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK; + } + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("can't allocate ptlrpc set\n"); + rc = -ENOMEM; + } else { + rc = obd_getattr_async(exp, &oinfo, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + } + if (rc == 0) + oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE | + OBD_MD_FLDATAVERSION); + RETURN(rc); +} + +/** + * Performs the getattr on the inode and updates its fields. + * If @sync != 0, perform the getattr under the server-side lock. + */ +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync) +{ + struct obd_capa *capa = ll_mdscapa_get(inode); + struct lov_stripe_md *lsm; + int rc; + ENTRY; + + lsm = ccc_inode_lsm_get(inode); + rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode), + capa, obdo, ioepoch, sync); + capa_put(capa); + if (rc == 0) { + struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi; + + obdo_refresh_inode(inode, obdo, obdo->o_valid); + CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu," + " blksize %lu\n", POSTID(oi), i_size_read(inode), + (unsigned long long)inode->i_blocks, + (unsigned long)ll_inode_blksize(inode)); + } + ccc_inode_lsm_put(inode, lsm); + RETURN(rc); +} + +int ll_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct cl_attr *attr = ccc_env_thread_attr(env); + struct ost_lvb lvb; + int rc = 0; + + ENTRY; + + ll_inode_size_lock(inode); + /* merge timestamps the most recently obtained from mds with + timestamps obtained from osts */ + LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime; + inode_init_lvb(inode, &lvb); + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + if (rc == 0) { + if (lvb.lvb_atime < attr->cat_atime) + lvb.lvb_atime = attr->cat_atime; + if (lvb.lvb_ctime < attr->cat_ctime) + lvb.lvb_ctime = attr->cat_ctime; + if (lvb.lvb_mtime < attr->cat_mtime) + lvb.lvb_mtime = attr->cat_mtime; + + CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n", + PFID(&lli->lli_fid), attr->cat_size); + cl_isize_write_nolock(inode, attr->cat_size); + + inode->i_blocks = attr->cat_blocks; + + LTIME_S(inode->i_mtime) = lvb.lvb_mtime; + LTIME_S(inode->i_atime) = lvb.lvb_atime; + LTIME_S(inode->i_ctime) = lvb.lvb_ctime; + } + ll_inode_size_unlock(inode); + + RETURN(rc); +} + +int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, + lstat_t *st) +{ + struct obdo obdo = { 0 }; + int rc; + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0); + if (rc == 0) { + st->st_size = obdo.o_size; + st->st_blocks = obdo.o_blocks; + st->st_mtime = obdo.o_mtime; + st->st_atime = obdo.o_atime; + st->st_ctime = obdo.o_ctime; + } + return rc; +} + +void ll_io_init(struct cl_io *io, const struct file *file, int write) +{ + struct inode *inode = file->f_dentry->d_inode; + + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + if (write) { + io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); + io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || + file->f_flags & O_DIRECT || + IS_SYNC(inode); + } + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (ll_file_nolock(file)) { + io->ci_lockreq = CILR_NEVER; + io->ci_no_srvlock = 1; + } else if (file->f_flags & O_APPEND) { + io->ci_lockreq = CILR_MANDATORY; + } +} + +static ssize_t +ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args, + struct file *file, enum cl_io_type iot, + loff_t *ppos, size_t count) +{ + struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct cl_io *io; + ssize_t result; + ENTRY; + +restart: + io = ccc_env_thread_io(env); + ll_io_init(io, file, iot == CIT_WRITE); + + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + int write_mutex_locked = 0; + + cio->cui_fd = LUSTRE_FPRIVATE(file); + vio->cui_io_subtype = args->via_io_subtype; + + switch (vio->cui_io_subtype) { + case IO_NORMAL: + cio->cui_iov = args->u.normal.via_iov; + cio->cui_nrsegs = args->u.normal.via_nrsegs; + cio->cui_tot_nrsegs = cio->cui_nrsegs; + cio->cui_iocb = args->u.normal.via_iocb; + if ((iot == CIT_WRITE) && + !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + if (mutex_lock_interruptible(&lli-> + lli_write_mutex)) + GOTO(out, result = -ERESTARTSYS); + write_mutex_locked = 1; + } else if (iot == CIT_READ) { + down_read(&lli->lli_trunc_sem); + } + break; + case IO_SENDFILE: + vio->u.sendfile.cui_actor = args->u.sendfile.via_actor; + vio->u.sendfile.cui_target = args->u.sendfile.via_target; + break; + case IO_SPLICE: + vio->u.splice.cui_pipe = args->u.splice.via_pipe; + vio->u.splice.cui_flags = args->u.splice.via_flags; + break; + default: + CERROR("Unknow IO type - %u\n", vio->cui_io_subtype); + LBUG(); + } + result = cl_io_loop(env, io); + if (write_mutex_locked) + mutex_unlock(&lli->lli_write_mutex); + else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ) + up_read(&lli->lli_trunc_sem); + } else { + /* cl_io_rw_init() handled IO */ + result = io->ci_result; + } + + if (io->ci_nob > 0) { + result = io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; + } + GOTO(out, result); +out: + cl_io_fini(env, io); + /* If any bit been read/written (result != 0), we just return + * short read/write instead of restart io. */ + if (result == 0 && io->ci_need_restart) { + CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n", + iot == CIT_READ ? "read" : "write", + file->f_dentry->d_name.name, *ppos, count); + LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob); + goto restart; + } + + if (iot == CIT_READ) { + if (result >= 0) + ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), + LPROC_LL_READ_BYTES, result); + } else if (iot == CIT_WRITE) { + if (result >= 0) { + ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode), + LPROC_LL_WRITE_BYTES, result); + fd->fd_write_failed = false; + } else if (result != -ERESTARTSYS) { + fd->fd_write_failed = true; + } + } + + return result; +} + + +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count) +{ + size_t cnt = 0; + unsigned long seg; + + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; + } + *count = cnt; + return 0; +} + +static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct lu_env *env; + struct vvp_io_args *args; + size_t count; + ssize_t result; + int refcheck; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iov = (struct iovec *)iov; + args->u.normal.via_nrsegs = nr_segs; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + &iocb->ki_pos, count); + cl_env_put(env, &refcheck); + RETURN(result); +} + +static ssize_t ll_file_read(struct file *file, char *buf, size_t count, + loff_t *ppos) +{ + struct lu_env *env; + struct iovec *local_iov; + struct kiocb *kiocb; + ssize_t result; + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + local_iov = &vvp_env_info(env)->vti_local_iov; + kiocb = &vvp_env_info(env)->vti_kiocb; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; + kiocb->ki_left = count; + + result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + + cl_env_put(env, &refcheck); + RETURN(result); +} + +/* + * Write to a file (through the page cache). + */ +static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct lu_env *env; + struct vvp_io_args *args; + size_t count; + ssize_t result; + int refcheck; + ENTRY; + + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = vvp_env_args(env, IO_NORMAL); + args->u.normal.via_iov = (struct iovec *)iov; + args->u.normal.via_nrsegs = nr_segs; + args->u.normal.via_iocb = iocb; + + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, count); + cl_env_put(env, &refcheck); + RETURN(result); +} + +static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, + loff_t *ppos) +{ + struct lu_env *env; + struct iovec *local_iov; + struct kiocb *kiocb; + ssize_t result; + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + local_iov = &vvp_env_info(env)->vti_local_iov; + kiocb = &vvp_env_info(env)->vti_kiocb; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; + kiocb->ki_left = count; + + result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; + + cl_env_put(env, &refcheck); + RETURN(result); +} + + + +/* + * Send file content (through pagecache) somewhere with helper + */ +static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, + unsigned int flags) +{ + struct lu_env *env; + struct vvp_io_args *args; + ssize_t result; + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + args = vvp_env_args(env, IO_SPLICE); + args->u.splice.via_pipe = pipe; + args->u.splice.via_flags = flags; + + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + RETURN(result); +} + +static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, + obd_count ost_idx) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct obd_trans_info oti = { 0 }; + struct obdo *oa = NULL; + int lsm_size; + int rc = 0; + struct lov_stripe_md *lsm = NULL, *lsm2; + ENTRY; + + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + GOTO(out, rc = -ENOENT); + + lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) * + (lsm->lsm_stripe_count)); + + OBD_ALLOC_LARGE(lsm2, lsm_size); + if (lsm2 == NULL) + GOTO(out, rc = -ENOMEM); + + oa->o_oi = *oi; + oa->o_nlink = ost_idx; + oa->o_flags |= OBD_FL_RECREATE_OBJS; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP; + obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); + memcpy(lsm2, lsm, lsm_size); + ll_inode_size_lock(inode); + rc = obd_create(NULL, exp, oa, &lsm2, &oti); + ll_inode_size_unlock(inode); + + OBD_FREE_LARGE(lsm2, lsm_size); + GOTO(out, rc); +out: + ccc_inode_lsm_put(inode, lsm); + OBDO_FREE(oa); + return rc; +} + +static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg) +{ + struct ll_recreate_obj ucreat; + struct ost_id oi; + ENTRY; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg, + sizeof(ucreat))) + RETURN(-EFAULT); + + ostid_set_seq_mdt0(&oi); + ostid_set_id(&oi, ucreat.lrc_id); + RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx)); +} + +static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg) +{ + struct lu_fid fid; + struct ost_id oi; + obd_count ost_idx; + ENTRY; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid))) + RETURN(-EFAULT); + + fid_to_ostid(&fid, &oi); + ost_idx = (fid_seq(&fid) >> 16) & 0xffff; + RETURN(ll_lov_recreate(inode, &oi, ost_idx)); +} + +int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, + int flags, struct lov_user_md *lum, int lum_size) +{ + struct lov_stripe_md *lsm = NULL; + struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags}; + int rc = 0; + ENTRY; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) { + ccc_inode_lsm_put(inode, lsm); + CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n", + inode->i_ino); + RETURN(-EEXIST); + } + + ll_inode_size_lock(inode); + rc = ll_intent_file_open(file, lum, lum_size, &oit); + if (rc) + GOTO(out, rc); + rc = oit.d.lustre.it_status; + if (rc < 0) + GOTO(out_req_free, rc); + + ll_release_openhandle(file->f_dentry, &oit); + + out: + ll_inode_size_unlock(inode); + ll_intent_release(&oit); + ccc_inode_lsm_put(inode, lsm); + RETURN(rc); +out_req_free: + ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data); + goto out; +} + +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmmp, int *lmm_size, + struct ptlrpc_request **request) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdt_body *body; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + int rc, lmmsize; + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc) + RETURN(rc); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, + strlen(filename), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc < 0) { + CDEBUG(D_INFO, "md_getattr_name failed " + "on %s: rc %d\n", filename, rc); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); /* checked by mdc_getattr_name */ + + lmmsize = body->eadatasize; + + if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || + lmmsize == 0) { + GOTO(out, rc = -ENODATA); + } + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); + LASSERT(lmm != NULL); + + if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) { + GOTO(out, rc = -EPROTO); + } + + /* + * This is coming from the MDS, so is probably in + * little endian. We convert it to host endian before + * passing it to userspace. + */ + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + } + } + +out: + *lmmp = lmm; + *lmm_size = lmmsize; + *request = req; + return rc; +} + +static int ll_lov_setea(struct inode *inode, struct file *file, + unsigned long arg) +{ + int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE; + struct lov_user_md *lump; + int lum_size = sizeof(struct lov_user_md) + + sizeof(struct lov_user_ost_data); + int rc; + ENTRY; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + RETURN(-EPERM); + + OBD_ALLOC_LARGE(lump, lum_size); + if (lump == NULL) + RETURN(-ENOMEM); + + if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) { + OBD_FREE_LARGE(lump, lum_size); + RETURN(-EFAULT); + } + + rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size); + + OBD_FREE_LARGE(lump, lum_size); + RETURN(rc); +} + +static int ll_lov_setstripe(struct inode *inode, struct file *file, + unsigned long arg) +{ + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + int lum_size, rc; + int flags = FMODE_WRITE; + ENTRY; + + /* first try with v1 which is smaller than v3 */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(lumv1, lumv1p, lum_size)) + RETURN(-EFAULT); + + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + lum_size = sizeof(struct lov_user_md_v3); + if (copy_from_user(&lumv3, lumv3p, lum_size)) + RETURN(-EFAULT); + } + + rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size); + if (rc == 0) { + struct lov_stripe_md *lsm; + __u32 gen; + + put_user(0, &lumv1p->lmm_stripe_count); + + ll_layout_refresh(inode, &gen); + lsm = ccc_inode_lsm_get(inode); + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), + 0, lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + } + RETURN(rc); +} + +static int ll_lov_getstripe(struct inode *inode, unsigned long arg) +{ + struct lov_stripe_md *lsm; + int rc = -ENODATA; + ENTRY; + + lsm = ccc_inode_lsm_get(inode); + if (lsm != NULL) + rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, + lsm, (void *)arg); + ccc_inode_lsm_put(inode, lsm); + RETURN(rc); +} + +int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + CWARN("group lock already existed with gid %lu\n", + fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + LASSERT(fd->fd_grouplock.cg_lock == NULL); + spin_unlock(&lli->lli_lock); + + rc = cl_get_grouplock(cl_i2info(inode)->lli_clob, + arg, (file->f_flags & O_NONBLOCK), &grouplock); + if (rc) + RETURN(rc); + + spin_lock(&lli->lli_lock); + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + spin_unlock(&lli->lli_lock); + CERROR("another thread just won the race\n"); + cl_put_grouplock(&grouplock); + RETURN(-EINVAL); + } + + fd->fd_flags |= LL_FILE_GROUP_LOCKED; + fd->fd_grouplock = grouplock; + spin_unlock(&lli->lli_lock); + + CDEBUG(D_INFO, "group lock %lu obtained\n", arg); + RETURN(0); +} + +int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct ccc_grouplock grouplock; + ENTRY; + + spin_lock(&lli->lli_lock); + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { + spin_unlock(&lli->lli_lock); + CWARN("no group lock held\n"); + RETURN(-EINVAL); + } + LASSERT(fd->fd_grouplock.cg_lock != NULL); + + if (fd->fd_grouplock.cg_gid != arg) { + CWARN("group lock %lu doesn't match current id %lu\n", + arg, fd->fd_grouplock.cg_gid); + spin_unlock(&lli->lli_lock); + RETURN(-EINVAL); + } + + grouplock = fd->fd_grouplock; + memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); + fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; + spin_unlock(&lli->lli_lock); + + cl_put_grouplock(&grouplock); + CDEBUG(D_INFO, "group lock %lu released\n", arg); + RETURN(0); +} + +/** + * Close inode open handle + * + * \param dentry [in] dentry which contains the inode + * \param it [in,out] intent which contains open info and result + * + * \retval 0 success + * \retval <0 failure + */ +int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it) +{ + struct inode *inode = dentry->d_inode; + struct obd_client_handle *och; + int rc; + ENTRY; + + LASSERT(inode); + + /* Root ? Do nothing. */ + if (dentry->d_inode->i_sb->s_root == dentry) + RETURN(0); + + /* No open handle to close? Move away */ + if (!it_disposition(it, DISP_OPEN_OPEN)) + RETURN(0); + + LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); + + OBD_ALLOC(och, sizeof(*och)); + if (!och) + GOTO(out, rc = -ENOMEM); + + ll_och_fill(ll_i2sbi(inode)->ll_md_exp, + ll_i2info(inode), it, och); + + rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, + inode, och); + out: + /* this one is in place of ll_file_open */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) { + ptlrpc_req_finished(it->d.lustre.it_data); + it_clear_disposition(it, DISP_ENQ_OPEN_REF); + } + RETURN(rc); +} + +/** + * Get size for inode for which FIEMAP mapping is requested. + * Make the FIEMAP get_info call and returns the result. + */ +int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, + int num_bytes) +{ + struct obd_export *exp = ll_i2dtexp(inode); + struct lov_stripe_md *lsm = NULL; + struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, }; + int vallen = num_bytes; + int rc; + ENTRY; + + /* Checks for fiemap flags */ + if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) { + fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT; + return -EBADR; + } + + /* Check for FIEMAP_FLAG_SYNC */ + if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) { + rc = filemap_fdatawrite(inode->i_mapping); + if (rc) + return rc; + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + return -ENOENT; + + /* If the stripe_count > 1 and the application does not understand + * DEVICE_ORDER flag, then it cannot interpret the extents correctly. + */ + if (lsm->lsm_stripe_count > 1 && + !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) + GOTO(out, rc = -EOPNOTSUPP); + + fm_key.oa.o_oi = lsm->lsm_oi; + fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE); + obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid); + /* If filesize is 0, then there would be no objects for mapping */ + if (fm_key.oa.o_size == 0) { + fiemap->fm_mapped_extents = 0; + GOTO(out, rc = 0); + } + + memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap)); + + rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen, + fiemap, lsm); + if (rc) + CERROR("obd_get_info failed: rc = %d\n", rc); + +out: + ccc_inode_lsm_put(inode, lsm); + RETURN(rc); +} + +int ll_fid2path(struct inode *inode, void *arg) +{ + struct obd_export *exp = ll_i2mdexp(inode); + struct getinfo_fid2path *gfout, *gfin; + int outsize, rc; + ENTRY; + + if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) && + !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) + RETURN(-EPERM); + + /* Need to get the buflen */ + OBD_ALLOC_PTR(gfin); + if (gfin == NULL) + RETURN(-ENOMEM); + if (copy_from_user(gfin, arg, sizeof(*gfin))) { + OBD_FREE_PTR(gfin); + RETURN(-EFAULT); + } + + outsize = sizeof(*gfout) + gfin->gf_pathlen; + OBD_ALLOC(gfout, outsize); + if (gfout == NULL) { + OBD_FREE_PTR(gfin); + RETURN(-ENOMEM); + } + memcpy(gfout, gfin, sizeof(*gfout)); + OBD_FREE_PTR(gfin); + + /* Call mdc_iocontrol */ + rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL); + if (rc) + GOTO(gf_free, rc); + + if (copy_to_user(arg, gfout, outsize)) + rc = -EFAULT; + +gf_free: + OBD_FREE(gfout, outsize); + RETURN(rc); +} + +static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg) +{ + struct ll_user_fiemap *fiemap_s; + size_t num_bytes, ret_bytes; + unsigned int extent_count; + int rc = 0; + + /* Get the extent count so we can calculate the size of + * required fiemap buffer */ + if (get_user(extent_count, + &((struct ll_user_fiemap __user *)arg)->fm_extent_count)) + RETURN(-EFAULT); + num_bytes = sizeof(*fiemap_s) + (extent_count * + sizeof(struct ll_fiemap_extent)); + + OBD_ALLOC_LARGE(fiemap_s, num_bytes); + if (fiemap_s == NULL) + RETURN(-ENOMEM); + + /* get the fiemap value */ + if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg, + sizeof(*fiemap_s))) + GOTO(error, rc = -EFAULT); + + /* If fm_extent_count is non-zero, read the first extent since + * it is used to calculate end_offset and device from previous + * fiemap call. */ + if (extent_count) { + if (copy_from_user(&fiemap_s->fm_extents[0], + (char __user *)arg + sizeof(*fiemap_s), + sizeof(struct ll_fiemap_extent))) + GOTO(error, rc = -EFAULT); + } + + rc = ll_do_fiemap(inode, fiemap_s, num_bytes); + if (rc) + GOTO(error, rc); + + ret_bytes = sizeof(struct ll_user_fiemap); + + if (extent_count != 0) + ret_bytes += (fiemap_s->fm_mapped_extents * + sizeof(struct ll_fiemap_extent)); + + if (copy_to_user((void *)arg, fiemap_s, ret_bytes)) + rc = -EFAULT; + +error: + OBD_FREE_LARGE(fiemap_s, num_bytes); + RETURN(rc); +} + +/* + * Read the data_version for inode. + * + * This value is computed using stripe object version on OST. + * Version is computed using server side locking. + * + * @param extent_lock Take extent lock. Not needed if a process is already + * holding the OST object group locks. + */ +int ll_data_version(struct inode *inode, __u64 *data_version, + int extent_lock) +{ + struct lov_stripe_md *lsm = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obdo *obdo = NULL; + int rc; + ENTRY; + + /* If no stripe, we consider version is 0. */ + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) { + *data_version = 0; + CDEBUG(D_INODE, "No object for inode\n"); + RETURN(0); + } + + OBD_ALLOC_PTR(obdo); + if (obdo == NULL) { + ccc_inode_lsm_put(inode, lsm); + RETURN(-ENOMEM); + } + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock); + if (!rc) { + if (!(obdo->o_valid & OBD_MD_FLDATAVERSION)) + rc = -EOPNOTSUPP; + else + *data_version = obdo->o_data_version; + } + + OBD_FREE_PTR(obdo); + ccc_inode_lsm_put(inode, lsm); + + RETURN(rc); +} + +struct ll_swap_stack { + struct iattr ia1, ia2; + __u64 dv1, dv2; + struct inode *inode1, *inode2; + bool check_dv1, check_dv2; +}; + +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc; + + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file1->f_dentry->d_inode; + llss->inode2 = file2->f_dentry->d_inode; + + if (!S_ISREG(llss->inode2->i_mode)) + GOTO(free, rc = -EINVAL); + + if (ll_permission(llss->inode1, MAY_WRITE, NULL) || + ll_permission(llss->inode2, MAY_WRITE, NULL)) + GOTO(free, rc = -EPERM); + + if (llss->inode2->i_sb != llss->inode1->i_sb) + GOTO(free, rc = -EXDEV); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); + if (rc == 0) /* same file, done! */ + GOTO(free, rc = 0); + + if (rc < 0) { /* sequentialize it */ + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); + } + + gid = lsl->sl_gid; + if (gid != 0) { /* application asks to flush dirty cache */ + rc = ll_get_grouplock(llss->inode1, file1, gid); + if (rc < 0) + GOTO(free, rc); + + rc = ll_get_grouplock(llss->inode2, file2, gid); + if (rc < 0) { + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); + } + } + + /* to be able to restore mtime and atime after swap + * we need to first save them */ + if (lsl->sl_flags & + (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_mtime = llss->inode1->i_mtime; + llss->ia1.ia_atime = llss->inode1->i_atime; + llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME; + llss->ia2.ia_mtime = llss->inode2->i_mtime; + llss->ia2.ia_atime = llss->inode2->i_atime; + llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME; + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + + /* struct md_op_data is used to send the swap args to the mdt + * only flags is missing, so we use struct mdc_swap_layouts + * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; + rc = -ENOMEM; + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); + if (op_data != NULL) { + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, + ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); + ll_finish_md_op_data(op_data); + } + +putgl: + if (gid != 0) { + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + + /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */ + if (rc != 0) + GOTO(free, rc); + + /* clear useless flags */ + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) { + llss->ia1.ia_valid &= ~ATTR_MTIME; + llss->ia2.ia_valid &= ~ATTR_MTIME; + } + + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_valid &= ~ATTR_ATIME; + llss->ia2.ia_valid &= ~ATTR_ATIME; + } + + /* update time if requested */ + rc = 0; + if (llss->ia2.ia_valid != 0) { + mutex_lock(&llss->inode1->i_mutex); + rc = ll_setattr(file1->f_dentry, &llss->ia2); + mutex_unlock(&llss->inode1->i_mutex); + } + + if (llss->ia1.ia_valid != 0) { + int rc1; + + mutex_lock(&llss->inode2->i_mutex); + rc1 = ll_setattr(file2->f_dentry, &llss->ia1); + mutex_unlock(&llss->inode2->i_mutex); + if (rc == 0) + rc = rc1; + } + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + + RETURN(rc); +} + +long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int flags, rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino, + inode->i_generation, inode, cmd); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); + + /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ + if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ + RETURN(-ENOTTY); + + switch(cmd) { + case LL_IOC_GETFLAGS: + /* Get the current value of the file flags */ + return put_user(fd->fd_flags, (int *)arg); + case LL_IOC_SETFLAGS: + case LL_IOC_CLRFLAGS: + /* Set or clear specific file flags */ + /* XXX This probably needs checks to ensure the flags are + * not abused, and to handle any flag side effects. + */ + if (get_user(flags, (int *) arg)) + RETURN(-EFAULT); + + if (cmd == LL_IOC_SETFLAGS) { + if ((flags & LL_FILE_IGNORE_LOCK) && + !(file->f_flags & O_DIRECT)) { + CERROR("%s: unable to disable locking on " + "non-O_DIRECT file\n", current->comm); + RETURN(-EINVAL); + } + + fd->fd_flags |= flags; + } else { + fd->fd_flags &= ~flags; + } + RETURN(0); + case LL_IOC_LOV_SETSTRIPE: + RETURN(ll_lov_setstripe(inode, file, arg)); + case LL_IOC_LOV_SETEA: + RETURN(ll_lov_setea(inode, file, arg)); + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct file *file2; + struct lustre_swap_layouts lsl; + + if (copy_from_user(&lsl, (char *)arg, + sizeof(struct lustre_swap_layouts))) + RETURN(-EFAULT); + + if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */ + RETURN(-EPERM); + + file2 = fget(lsl.sl_fd); + if (file2 == NULL) + RETURN(-EBADF); + + rc = -EPERM; + if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */ + rc = ll_swap_layouts(file, file2, &lsl); + fput(file2); + RETURN(rc); + } + case LL_IOC_LOV_GETSTRIPE: + RETURN(ll_lov_getstripe(inode, arg)); + case LL_IOC_RECREATE_OBJ: + RETURN(ll_lov_recreate_obj(inode, arg)); + case LL_IOC_RECREATE_FID: + RETURN(ll_lov_recreate_fid(inode, arg)); + case FSFILT_IOC_FIEMAP: + RETURN(ll_ioctl_fiemap(inode, arg)); + case FSFILT_IOC_GETFLAGS: + case FSFILT_IOC_SETFLAGS: + RETURN(ll_iocontrol(inode, file, cmd, arg)); + case FSFILT_IOC_GETVERSION_OLD: + case FSFILT_IOC_GETVERSION: + RETURN(put_user(inode->i_generation, (int *)arg)); + case LL_IOC_GROUP_LOCK: + RETURN(ll_get_grouplock(inode, file, arg)); + case LL_IOC_GROUP_UNLOCK: + RETURN(ll_put_grouplock(inode, file, arg)); + case IOC_OBD_STATFS: + RETURN(ll_obd_statfs(inode, (void *)arg)); + + /* We need to special case any other ioctls we want to handle, + * to send them to the MDS/OST as appropriate and to properly + * network encode the arg field. + case FSFILT_IOC_SETVERSION_OLD: + case FSFILT_IOC_SETVERSION: + */ + case LL_IOC_FLUSHCTX: + RETURN(ll_flush_ctx(inode)); + case LL_IOC_PATH2FID: { + if (copy_to_user((void *)arg, ll_inode2fid(inode), + sizeof(struct lu_fid))) + RETURN(-EFAULT); + + RETURN(0); + } + case OBD_IOC_FID2PATH: + RETURN(ll_fid2path(inode, (void *)arg)); + case LL_IOC_DATA_VERSION: { + struct ioc_data_version idv; + int rc; + + if (copy_from_user(&idv, (char *)arg, sizeof(idv))) + RETURN(-EFAULT); + + rc = ll_data_version(inode, &idv.idv_version, + !(idv.idv_flags & LL_DV_NOFLUSH)); + + if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv))) + RETURN(-EFAULT); + + RETURN(rc); + } + + case LL_IOC_GET_MDTIDX: { + int mdtidx; + + mdtidx = ll_get_mdt_idx(inode); + if (mdtidx < 0) + RETURN(mdtidx); + + if (put_user((int)mdtidx, (int*)arg)) + RETURN(-EFAULT); + + RETURN(0); + } + case OBD_IOC_GETDTNAME: + case OBD_IOC_GETMDNAME: + RETURN(ll_get_obd_name(inode, cmd, arg)); + case LL_IOC_HSM_STATE_GET: { + struct md_op_data *op_data; + struct hsm_user_state *hus; + int rc; + + OBD_ALLOC_PTR(hus); + if (hus == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hus); + if (op_data == NULL) { + OBD_FREE_PTR(hus); + RETURN(-ENOMEM); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((void *)arg, hus, sizeof(*hus))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hus); + RETURN(rc); + } + case LL_IOC_HSM_STATE_SET: { + struct md_op_data *op_data; + struct hsm_state_set *hss; + int rc; + + OBD_ALLOC_PTR(hss); + if (hss == NULL) + RETURN(-ENOMEM); + if (copy_from_user(hss, (char *)arg, sizeof(*hss))) { + OBD_FREE_PTR(hss); + RETURN(-EFAULT); + } + + /* Non-root users are forbidden to set or clear flags which are + * NOT defined in HSM_USER_MASK. */ + if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) + && !cfs_capable(CFS_CAP_SYS_ADMIN)) { + OBD_FREE_PTR(hss); + RETURN(-EPERM); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hss); + if (op_data == NULL) { + OBD_FREE_PTR(hss); + RETURN(-ENOMEM); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + ll_finish_md_op_data(op_data); + + OBD_FREE_PTR(hss); + RETURN(rc); + } + case LL_IOC_HSM_ACTION: { + struct md_op_data *op_data; + struct hsm_current_action *hca; + int rc; + + OBD_ALLOC_PTR(hca); + if (hca == NULL) + RETURN(-ENOMEM); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, hca); + if (op_data == NULL) { + OBD_FREE_PTR(hca); + RETURN(-ENOMEM); + } + + rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), + op_data, NULL); + + if (copy_to_user((char *)arg, hca, sizeof(*hca))) + rc = -EFAULT; + + ll_finish_md_op_data(op_data); + OBD_FREE_PTR(hca); + RETURN(rc); + } + default: { + int err; + + if (LLIOC_STOP == + ll_iocontrol_call(inode, file, cmd, arg, &err)) + RETURN(err); + + RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, + (void *)arg)); + } + } +} + + +loff_t ll_file_seek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_dentry->d_inode; + loff_t retval, eof = 0; + + ENTRY; + retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : + (origin == SEEK_CUR) ? file->f_pos : 0); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n", + inode->i_ino, inode->i_generation, inode, retval, retval, + origin); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); + + if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { + retval = ll_glimpse_size(inode); + if (retval != 0) + RETURN(retval); + eof = i_size_read(inode); + } + + retval = ll_generic_file_llseek_size(file, offset, origin, + ll_file_maxbytes(inode), eof); + RETURN(retval); +} + +int ll_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + int rc, err; + + LASSERT(!S_ISDIR(inode->i_mode)); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + rc = lli->lli_async_rc; + lli->lli_async_rc = 0; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + + /* The application has been told write failure already. + * Do not report failure again. */ + if (fd->fd_write_failed) + return 0; + return rc ? -EIO : 0; +} + +/** + * Called to make sure a portion of file has been written out. + * if @local_only is not true, it will send OST_SYNC RPCs to ost. + * + * Return how many pages have been written. + */ +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct obd_capa *capa = NULL; + struct cl_fsync_io *fio; + int result; + ENTRY; + + if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && + mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) + RETURN(-EINVAL); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE); + + io = ccc_env_thread_io(env); + io->ci_obj = cl_i2info(inode)->lli_clob; + io->ci_ignore_layout = 1; + + /* initialize parameters for sync */ + fio = &io->u.ci_fsync; + fio->fi_capa = capa; + fio->fi_start = start; + fio->fi_end = end; + fio->fi_fid = ll_inode2fid(inode); + fio->fi_mode = mode; + fio->fi_nr_written = 0; + + if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + if (result == 0) + result = fio->fi_nr_written; + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + capa_put(capa); + + RETURN(result); +} + +/* + * When dentry is provided (the 'else' case), *file->f_dentry may be + * null and dentry must be used directly rather than pulled from + * *file->f_dentry as is done otherwise. + */ + +int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *req; + struct obd_capa *oc; + int rc, err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + mutex_lock(&inode->i_mutex); + + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. */ + if (!S_ISDIR(inode->i_mode)) { + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + err = lov_read_and_clear_async_rc(lli->lli_clob); + if (rc == 0) + rc = err; + } + + oc = ll_mdscapa_get(inode); + err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc, + &req); + capa_put(oc); + if (!rc) + rc = err; + if (!err) + ptlrpc_req_finished(req); + + if (datasync && S_ISREG(inode->i_mode)) { + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, + CL_FSYNC_ALL); + if (rc == 0 && err < 0) + rc = err; + if (rc < 0) + fd->fd_write_failed = true; + else + fd->fd_write_failed = false; + } + + mutex_unlock(&inode->i_mutex); + RETURN(rc); +} + +int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK, + .ei_cb_cp =ldlm_flock_completion_ast, + .ei_cbdata = file_lock }; + struct md_op_data *op_data; + struct lustre_handle lockh = {0}; + ldlm_policy_data_t flock = {{0}}; + int flags = 0; + int rc; + int rc2 = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n", + inode->i_ino, file_lock); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + + if (file_lock->fl_flags & FL_FLOCK) { + LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); + /* flocks are whole-file locks */ + flock.l_flock.end = OFFSET_MAX; + /* For flocks owner is determined by the local file desctiptor*/ + flock.l_flock.owner = (unsigned long)file_lock->fl_file; + } else if (file_lock->fl_flags & FL_POSIX) { + flock.l_flock.owner = (unsigned long)file_lock->fl_owner; + flock.l_flock.start = file_lock->fl_start; + flock.l_flock.end = file_lock->fl_end; + } else { + RETURN(-EINVAL); + } + flock.l_flock.pid = file_lock->fl_pid; + + /* Somewhat ugly workaround for svc lockd. + * lockd installs custom fl_lmops->lm_compare_owner that checks + * for the fl_owner to be the same (which it always is on local node + * I guess between lockd processes) and then compares pid. + * As such we assign pid to the owner field to make it all work, + * conflict with normal locks is unlikely since pid space and + * pointer space for current->files are not intersecting */ + if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) + flock.l_flock.owner = (unsigned long)file_lock->fl_pid; + + switch (file_lock->fl_type) { + case F_RDLCK: + einfo.ei_mode = LCK_PR; + break; + case F_UNLCK: + /* An unlock request may or may not have any relation to + * existing locks so we may not be able to pass a lock handle + * via a normal ldlm_lock_cancel() request. The request may even + * unlock a byte range in the middle of an existing lock. In + * order to process an unlock request we need all of the same + * information that is given with a normal read or write record + * lock request. To avoid creating another ldlm unlock (cancel) + * message we'll treat a LCK_NL flock request as an unlock. */ + einfo.ei_mode = LCK_NL; + break; + case F_WRLCK: + einfo.ei_mode = LCK_PW; + break; + default: + CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", + file_lock->fl_type); + RETURN (-ENOTSUPP); + } + + switch (cmd) { + case F_SETLKW: +#ifdef F_SETLKW64 + case F_SETLKW64: +#endif + flags = 0; + break; + case F_SETLK: +#ifdef F_SETLK64 + case F_SETLK64: +#endif + flags = LDLM_FL_BLOCK_NOWAIT; + break; + case F_GETLK: +#ifdef F_GETLK64 + case F_GETLK64: +#endif + flags = LDLM_FL_TEST_LOCK; + /* Save the old mode so that if the mode in the lock changes we + * can decrement the appropriate reader or writer refcount. */ + file_lock->fl_type = einfo.ei_mode; + break; + default: + CERROR("unknown fcntl lock command: %d\n", cmd); + RETURN (-EINVAL); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, " + "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid, + flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + + if ((file_lock->fl_flags & FL_FLOCK) && + (rc == 0 || file_lock->fl_type == F_UNLCK)) + rc2 = flock_lock_file_wait(file, file_lock); + if ((file_lock->fl_flags & FL_POSIX) && + (rc == 0 || file_lock->fl_type == F_UNLCK) && + !(flags & LDLM_FL_TEST_LOCK)) + rc2 = posix_lock_file_wait(file, file_lock); + + if (rc2 && file_lock->fl_type != F_UNLCK) { + einfo.ei_mode = LCK_NL; + md_enqueue(sbi->ll_md_exp, &einfo, NULL, + op_data, &lockh, &flock, 0, NULL /* req */, flags); + rc = rc2; + } + + ll_finish_md_op_data(op_data); + + RETURN(rc); +} + +int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) +{ + ENTRY; + + RETURN(-ENOSYS); +} + +/** + * test if some locks matching bits and l_req_mode are acquired + * - bits can be in different locks + * - if found clear the common lock bits in *bits + * - the bits not found, are kept in *bits + * \param inode [IN] + * \param bits [IN] searched lock bits [IN] + * \param l_req_mode [IN] searched lock mode + * \retval boolean, true iff all bits are found + */ +int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode) +{ + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ? + (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode; + struct lu_fid *fid; + __u64 flags; + int i; + ENTRY; + + if (!inode) + RETURN(0); + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid), + ldlm_lockname[mode]); + + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; + for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) { + policy.l_inodebits.bits = *bits & (1 << i); + if (policy.l_inodebits.bits == 0) + continue; + + if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, + &policy, mode, &lockh)) { + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(&lockh); + if (lock) { + *bits &= + ~(lock->l_policy_data.l_inodebits.bits); + LDLM_LOCK_PUT(lock); + } else { + *bits &= ~policy.l_inodebits.bits; + } + } + } + RETURN(*bits == 0); +} + +ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags) +{ + ldlm_policy_data_t policy = { .l_inodebits = {bits}}; + struct lu_fid *fid; + ldlm_mode_t rc; + ENTRY; + + fid = &ll_i2info(inode)->lli_fid; + CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid)); + + rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags, + fid, LDLM_IBITS, &policy, + LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh); + RETURN(rc); +} + +static int ll_inode_revalidate_fini(struct inode *inode, int rc) +{ + /* Already unlinked. Just update nlink and return success */ + if (rc == -ENOENT) { + clear_nlink(inode); + /* This path cannot be hit for regular files unless in + * case of obscure races, so no need to to validate + * size. */ + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return 0; + } else if (rc != 0) { + CERROR("%s: revalidate FID "DFID" error: rc = %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(ll_inode2fid(inode)), rc); + } + + return rc; +} + +int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it, + __u64 ibits) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *req = NULL; + struct obd_export *exp; + int rc = 0; + ENTRY; + + LASSERT(inode != NULL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n", + inode->i_ino, inode->i_generation, inode, dentry->d_name.name); + + exp = ll_i2mdexp(inode); + + /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. + * But under CMD case, it caused some lock issues, should be fixed + * with new CMD ibits lock. See bug 12718 */ + if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { + struct lookup_intent oit = { .it_op = IT_GETATTR }; + struct md_op_data *op_data; + + if (ibits == MDS_INODELOCK_LOOKUP) + oit.it_op = IT_LOOKUP; + + /* Call getattr by fid, so do not provide name at all. */ + op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode, + dentry->d_inode, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + oit.it_create_mode |= M_CHECK_STALE; + rc = md_intent_lock(exp, op_data, NULL, 0, + /* we are not interested in name + based lookup */ + &oit, 0, &req, + ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + oit.it_create_mode &= ~M_CHECK_STALE; + if (rc < 0) { + rc = ll_inode_revalidate_fini(inode, rc); + GOTO (out, rc); + } + + rc = ll_revalidate_it_finish(req, &oit, dentry); + if (rc != 0) { + ll_intent_release(&oit); + GOTO(out, rc); + } + + /* Unlinked? Unhash dentry, so it is not picked up later by + do_lookup() -> ll_revalidate_it(). We cannot use d_drop + here to preserve get_cwd functionality on 2.6. + Bug 10503 */ + if (!dentry->d_inode->i_nlink) + d_lustre_invalidate(dentry); + + ll_lookup_finish_locks(&oit, dentry); + } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + obd_valid valid = OBD_MD_FLGETATTR; + struct md_op_data *op_data; + int ealen = 0; + + if (S_ISREG(inode->i_mode)) { + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + RETURN(rc); + valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, ealen, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = valid; + /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one + * capa for this inode. Because we only keep capas of dirs + * fresh. */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + rc = ll_inode_revalidate_fini(inode, rc); + RETURN(rc); + } + + rc = ll_prep_inode(&inode, req, NULL, NULL); + } +out: + ptlrpc_req_finished(req); + return rc; +} + +int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it, + __u64 ibits) +{ + struct inode *inode = dentry->d_inode; + int rc; + ENTRY; + + rc = __ll_inode_revalidate_it(dentry, it, ibits); + if (rc != 0) + RETURN(rc); + + /* if object isn't regular file, don't validate size */ + if (!S_ISREG(inode->i_mode)) { + LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime; + LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime; + LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime; + } else { + rc = ll_glimpse_size(inode); + } + RETURN(rc); +} + +int ll_getattr_it(struct vfsmount *mnt, struct dentry *de, + struct lookup_intent *it, struct kstat *stat) +{ + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int res = 0; + + res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LOOKUP); + ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1); + + if (res) + return res; + + stat->dev = inode->i_sb->s_dev; + if (ll_need_32bit_api(sbi)) + stat->ino = cl_fid_build_ino(&lli->lli_fid, 1); + else + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = 1 << inode->i_blkbits; + + stat->size = i_size_read(inode); + stat->blocks = inode->i_blocks; + + return 0; +} +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) +{ + struct lookup_intent it = { .it_op = IT_GETATTR }; + + return ll_getattr_it(mnt, de, &it, stat); +} + + +struct posix_acl * ll_get_acl(struct inode *inode, int type) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl = NULL; + ENTRY; + + spin_lock(&lli->lli_lock); + /* VFS' acl_permission_check->check_acl will release the refcount */ + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + RETURN(acl); +} + + +int ll_inode_permission(struct inode *inode, int mask) +{ + int rc = 0; + ENTRY; + +#ifdef MAY_NOT_BLOCK + if (mask & MAY_NOT_BLOCK) + return -ECHILD; +#endif + + /* as root inode are NOT getting validated in lookup operation, + * need to do it before permission check. */ + + if (inode == inode->i_sb->s_root->d_inode) { + struct lookup_intent it = { .it_op = IT_LOOKUP }; + + rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it, + MDS_INODELOCK_LOOKUP); + if (rc) + RETURN(rc); + } + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n", + inode->i_ino, inode->i_generation, inode, inode->i_mode, mask); + + if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT) + return lustre_check_remote_perm(inode, mask); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); + rc = ll_generic_permission(inode, mask, flags, ll_check_acl); + + RETURN(rc); +} + +#define READ_METHOD aio_read +#define READ_FUNCTION ll_file_aio_read +#define WRITE_METHOD aio_write +#define WRITE_FUNCTION ll_file_aio_write + +/* -o localflock - only provides locally consistent flock locks */ +struct file_operations ll_file_operations = { + .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, + .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush +}; + +struct file_operations ll_file_operations_flock = { + .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, + .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_flock, + .lock = ll_file_flock +}; + +/* These are for -o noflock - to return ENOSYS on flock calls */ +struct file_operations ll_file_operations_noflock = { + .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, + .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, + .unlocked_ioctl = ll_file_ioctl, + .open = ll_file_open, + .release = ll_file_release, + .mmap = ll_file_mmap, + .llseek = ll_file_seek, + .splice_read = ll_file_splice_read, + .fsync = ll_fsync, + .flush = ll_flush, + .flock = ll_file_noflock, + .lock = ll_file_noflock +}; + +struct inode_operations ll_file_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; + +/* dynamic ioctl number support routins */ +static struct llioc_ctl_data { + struct rw_semaphore ioc_sem; + struct list_head ioc_head; +} llioc = { + __RWSEM_INITIALIZER(llioc.ioc_sem), + LIST_HEAD_INIT(llioc.ioc_head) +}; + + +struct llioc_data { + struct list_head iocd_list; + unsigned int iocd_size; + llioc_callback_t iocd_cb; + unsigned int iocd_count; + unsigned int iocd_cmd[0]; +}; + +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd) +{ + unsigned int size; + struct llioc_data *in_data = NULL; + ENTRY; + + if (cb == NULL || cmd == NULL || + count > LLIOC_MAX_CMD || count < 0) + RETURN(NULL); + + size = sizeof(*in_data) + count * sizeof(unsigned int); + OBD_ALLOC(in_data, size); + if (in_data == NULL) + RETURN(NULL); + + memset(in_data, 0, sizeof(*in_data)); + in_data->iocd_size = size; + in_data->iocd_cb = cb; + in_data->iocd_count = count; + memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count); + + down_write(&llioc.ioc_sem); + list_add_tail(&in_data->iocd_list, &llioc.ioc_head); + up_write(&llioc.ioc_sem); + + RETURN(in_data); +} + +void ll_iocontrol_unregister(void *magic) +{ + struct llioc_data *tmp; + + if (magic == NULL) + return; + + down_write(&llioc.ioc_sem); + list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) { + if (tmp == magic) { + unsigned int size = tmp->iocd_size; + + list_del(&tmp->iocd_list); + up_write(&llioc.ioc_sem); + + OBD_FREE(tmp, size); + return; + } + } + up_write(&llioc.ioc_sem); + + CWARN("didn't find iocontrol register block with magic: %p\n", magic); +} + +EXPORT_SYMBOL(ll_iocontrol_register); +EXPORT_SYMBOL(ll_iocontrol_unregister); + +enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp) +{ + enum llioc_iter ret = LLIOC_CONT; + struct llioc_data *data; + int rc = -EINVAL, i; + + down_read(&llioc.ioc_sem); + list_for_each_entry(data, &llioc.ioc_head, iocd_list) { + for (i = 0; i < data->iocd_count; i++) { + if (cmd != data->iocd_cmd[i]) + continue; + + ret = data->iocd_cb(inode, file, cmd, arg, data, &rc); + break; + } + + if (ret == LLIOC_STOP) + break; + } + up_read(&llioc.ioc_sem); + + if (rcp) + *rcp = rc; + return ret; +} + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct cl_env_nest nest; + struct lu_env *env; + int result; + ENTRY; + + if (lli->lli_clob == NULL) + RETURN(0); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + result = cl_conf_set(env, lli->lli_clob, conf); + cl_env_nested_put(&nest, env); + + if (conf->coc_opc == OBJECT_CONF_SET) { + struct ldlm_lock *lock = conf->coc_lock; + + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + if (result == 0) { + /* it can only be allowed to match after layout is + * applied to inode otherwise false layout would be + * seen. Applying layout shoud happen before dropping + * the intent lock. */ + ldlm_lock_allow_match(lock); + } + } + RETURN(result); +} + +/* Fetch layout from MDT with getxattr request, if it's not ready yet */ +static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) + +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_capa *oc; + struct ptlrpc_request *req; + struct mdt_body *body; + void *lvbdata; + void *lmm; + int lmmsize; + int rc; + ENTRY; + + if (lock->l_lvb_data != NULL) + RETURN(0); + + /* if layout lock was granted right away, the layout is returned + * within DLM_LVB of dlm reply; otherwise if the lock was ever + * blocked and then granted via completion ast, we have to fetch + * layout here. Please note that we can't use the LVB buffer in + * completion AST because it doesn't have a large enough buffer */ + oc = ll_mdscapa_get(inode); + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc == 0) + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0, + lmmsize, 0, &req); + capa_put(oc); + if (rc < 0) + RETURN(rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL || body->eadatasize > lmmsize) + GOTO(out, rc = -EPROTO); + + lmmsize = body->eadatasize; + if (lmmsize == 0) /* empty layout */ + GOTO(out, rc = 0); + + lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize); + if (lmm == NULL) + GOTO(out, rc = -EFAULT); + + OBD_ALLOC_LARGE(lvbdata, lmmsize); + if (lvbdata == NULL) + GOTO(out, rc = -ENOMEM); + + memcpy(lvbdata, lmm, lmmsize); + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_data = lvbdata; + lock->l_lvb_len = lmmsize; + lvbdata = NULL; + } + unlock_res_and_lock(lock); + + if (lvbdata != NULL) + OBD_FREE_LARGE(lvbdata, lmmsize); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +/** + * Apply the layout to the inode. Layout lock is held and will be released + * in this function. + */ +static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode, + struct inode *inode, __u32 *gen, bool reconf) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ldlm_lock *lock; + struct lustre_md md = { NULL }; + struct cl_object_conf conf; + int rc = 0; + bool lvb_ready; + bool wait_layout = false; + ENTRY; + + LASSERT(lustre_handle_is_used(lockh)); + + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + LASSERT(ldlm_has_layout(lock)); + + LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n", + inode, PFID(&lli->lli_fid), reconf); + + lock_res_and_lock(lock); + lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY); + unlock_res_and_lock(lock); + /* checking lvb_ready is racy but this is okay. The worst case is + * that multi processes may configure the file on the same time. */ + if (lvb_ready || !reconf) { + rc = -ENODATA; + if (lvb_ready) { + /* layout_gen must be valid if layout lock is not + * cancelled and stripe has already set */ + *gen = lli->lli_layout_gen; + rc = 0; + } + GOTO(out, rc); + } + + rc = ll_layout_fetch(inode, lock); + if (rc < 0) + GOTO(out, rc); + + /* for layout lock, lmm is returned in lock's lvb. + * lvb_data is immutable if the lock is held so it's safe to access it + * without res lock. See the description in ldlm_lock_decref_internal() + * for the condition to free lvb_data of layout lock */ + if (lock->l_lvb_data != NULL) { + rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm, + lock->l_lvb_data, lock->l_lvb_len); + if (rc >= 0) { + *gen = LL_LAYOUT_GEN_EMPTY; + if (md.lsm != NULL) + *gen = md.lsm->lsm_layout_gen; + rc = 0; + } else { + CERROR("%s: file "DFID" unpackmd error: %d\n", + ll_get_fsname(inode->i_sb, NULL, 0), + PFID(&lli->lli_fid), rc); + } + } + if (rc < 0) + GOTO(out, rc); + + /* set layout to file. Unlikely this will fail as old layout was + * surely eliminated */ + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + rc = ll_layout_conf(inode, &conf); + + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + + /* refresh layout failed, need to wait */ + wait_layout = rc == -EBUSY; + EXIT; + +out: + LDLM_LOCK_PUT(lock); + ldlm_lock_decref(lockh, mode); + + /* wait for IO to complete if it's still being used. */ + if (wait_layout) { + CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n", + ll_get_fsname(inode->i_sb, NULL, 0), + inode, PFID(&lli->lli_fid)); + + memset(&conf, 0, sizeof conf); + conf.coc_opc = OBJECT_CONF_WAIT; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc == 0) + rc = -EAGAIN; + + CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n", + PFID(&lli->lli_fid), rc); + } + RETURN(rc); +} + +/** + * This function checks if there exists a LAYOUT lock on the client side, + * or enqueues it if it doesn't have one in cache. + * + * This function will not hold layout lock so it may be revoked any time after + * this function returns. Any operations depend on layout should be redone + * in that case. + * + * This function should be called before lov_io_init() to get an uptodate + * layout version, the caller should save the version number and after IO + * is finished, this function should be called again to verify that layout + * is not changed during IO time. + */ +int ll_layout_refresh(struct inode *inode, __u32 *gen) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct md_op_data *op_data; + struct lookup_intent it; + struct lustre_handle lockh; + ldlm_mode_t mode; + struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS, + .ei_mode = LCK_CR, + .ei_cb_bl = ll_md_blocking_ast, + .ei_cb_cp = ldlm_completion_ast, + .ei_cbdata = NULL }; + int rc; + ENTRY; + + *gen = lli->lli_layout_gen; + if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) + RETURN(0); + + /* sanity checks */ + LASSERT(fid_is_sane(ll_inode2fid(inode))); + LASSERT(S_ISREG(inode->i_mode)); + + /* mostly layout lock is caching on the local side, so try to match + * it before grabbing layout lock mutex. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode, gen, false); + if (rc == 0) + RETURN(0); + + /* better hold lli_layout_mutex to try again otherwise + * it will have starvation problem. */ + } + + /* take layout lock mutex to enqueue layout lock exclusively. */ + mutex_lock(&lli->lli_layout_mutex); + +again: + /* try again. Maybe somebody else has done this. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0); + if (mode != 0) { /* hit cached lock */ + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + + mutex_unlock(&lli->lli_layout_mutex); + RETURN(rc); + } + + op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + mutex_unlock(&lli->lli_layout_mutex); + RETURN(PTR_ERR(op_data)); + } + + /* have to enqueue one */ + memset(&it, 0, sizeof(it)); + it.it_op = IT_LAYOUT; + lockh.cookie = 0ULL; + + LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n", + ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid)); + + rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh, + NULL, 0, NULL, 0); + if (it.d.lustre.it_data != NULL) + ptlrpc_req_finished(it.d.lustre.it_data); + it.d.lustre.it_data = NULL; + + ll_finish_md_op_data(op_data); + + md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL); + + mode = it.d.lustre.it_lock_mode; + it.d.lustre.it_lock_mode = 0; + ll_intent_drop_lock(&it); + + if (rc == 0) { + /* set lock data in case this is a new lock */ + ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); + rc = ll_layout_lock_set(&lockh, mode, inode, gen, true); + if (rc == -EAGAIN) + goto again; + } + mutex_unlock(&lli->lli_layout_mutex); + + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/llite/llite_capa.c b/drivers/staging/lustre/lustre/llite/llite_capa.c new file mode 100644 index 000000000000..b6fd9593325a --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_capa.c @@ -0,0 +1,661 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_capa.c + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include + +#include +#include "llite_internal.h" + +/* for obd_capa.c_list, client capa might stay in three places: + * 1. ll_capa_list. + * 2. ll_idle_capas. + * 3. stand alone: just allocated. + */ + +/* capas for oss writeback and those failed to renew */ +static LIST_HEAD(ll_idle_capas); +static struct ptlrpc_thread ll_capa_thread; +static struct list_head *ll_capa_list = &capa_list[CAPA_SITE_CLIENT]; + +/* llite capa renewal timer */ +struct timer_list ll_capa_timer; +/* for debug: indicate whether capa on llite is enabled or not */ +static atomic_t ll_capa_debug = ATOMIC_INIT(0); +static unsigned long long ll_capa_renewed = 0; +static unsigned long long ll_capa_renewal_noent = 0; +static unsigned long long ll_capa_renewal_failed = 0; +static unsigned long long ll_capa_renewal_retries = 0; + +static inline void update_capa_timer(struct obd_capa *ocapa, cfs_time_t expiry) +{ + if (cfs_time_before(expiry, ll_capa_timer.expires) || + !timer_pending(&ll_capa_timer)) { + mod_timer(&ll_capa_timer, expiry); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "ll_capa_timer update: %lu/%lu by", expiry, jiffies); + } +} + +static inline cfs_time_t capa_renewal_time(struct obd_capa *ocapa) +{ + return cfs_time_sub(ocapa->c_expiry, + cfs_time_seconds(ocapa->c_capa.lc_timeout) / 2); +} + +static inline int capa_is_to_expire(struct obd_capa *ocapa) +{ + return cfs_time_beforeq(capa_renewal_time(ocapa), cfs_time_current()); +} + +static inline int have_expired_capa(void) +{ + struct obd_capa *ocapa = NULL; + int expired = 0; + + /* if ll_capa_list has client capa to expire or ll_idle_capas has + * expired capa, return 1. + */ + spin_lock(&capa_lock); + if (!list_empty(ll_capa_list)) { + ocapa = list_entry(ll_capa_list->next, struct obd_capa, + c_list); + expired = capa_is_to_expire(ocapa); + if (!expired) + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } else if (!list_empty(&ll_idle_capas)) { + ocapa = list_entry(ll_idle_capas.next, struct obd_capa, + c_list); + expired = capa_is_expired(ocapa); + if (!expired) + update_capa_timer(ocapa, ocapa->c_expiry); + } + spin_unlock(&capa_lock); + + if (expired) + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired"); + return expired; +} + +static void sort_add_capa(struct obd_capa *ocapa, struct list_head *head) +{ + struct obd_capa *tmp; + struct list_head *before = NULL; + + /* TODO: client capa is sorted by expiry, this could be optimized */ + list_for_each_entry_reverse(tmp, head, c_list) { + if (cfs_time_aftereq(ocapa->c_expiry, tmp->c_expiry)) { + before = &tmp->c_list; + break; + } + } + + LASSERT(&ocapa->c_list != before); + list_add(&ocapa->c_list, before ?: head); +} + +static inline int obd_capa_open_count(struct obd_capa *oc) +{ + struct ll_inode_info *lli = ll_i2info(oc->u.cli.inode); + return atomic_read(&lli->lli_open_count); +} + +static void ll_delete_capa(struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(ocapa->u.cli.inode); + + if (capa_for_mds(&ocapa->c_capa)) { + LASSERT(lli->lli_mds_capa == ocapa); + lli->lli_mds_capa = NULL; + } else if (capa_for_oss(&ocapa->c_capa)) { + list_del_init(&ocapa->u.cli.lli_list); + } + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client"); + list_del_init(&ocapa->c_list); + capa_count[CAPA_SITE_CLIENT]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +/* three places where client capa is deleted: + * 1. capa_thread_main(), main place to delete expired capa. + * 2. ll_clear_inode_capas() in ll_clear_inode(). + * 3. ll_truncate_free_capa() delete truncate capa explicitly in ll_setattr_ost(). + */ +static int capa_thread_main(void *unused) +{ + struct obd_capa *ocapa, *tmp, *next; + struct inode *inode = NULL; + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + + thread_set_flags(&ll_capa_thread, SVC_RUNNING); + wake_up(&ll_capa_thread.t_ctl_waitq); + + while (1) { + l_wait_event(ll_capa_thread.t_ctl_waitq, + !thread_is_running(&ll_capa_thread) || + have_expired_capa(), + &lwi); + + if (!thread_is_running(&ll_capa_thread)) + break; + + next = NULL; + + spin_lock(&capa_lock); + list_for_each_entry_safe(ocapa, tmp, ll_capa_list, c_list) { + __u64 ibits; + + LASSERT(ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC); + + if (!capa_is_to_expire(ocapa)) { + next = ocapa; + break; + } + + list_del_init(&ocapa->c_list); + + /* for MDS capability, only renew those which belong to + * dir, or its inode is opened, or client holds LOOKUP + * lock. + */ + /* ibits may be changed by ll_have_md_lock() so we have + * to set it each time */ + ibits = MDS_INODELOCK_LOOKUP; + if (capa_for_mds(&ocapa->c_capa) && + !S_ISDIR(ocapa->u.cli.inode->i_mode) && + obd_capa_open_count(ocapa) == 0 && + !ll_have_md_lock(ocapa->u.cli.inode, + &ibits, LCK_MINMODE)) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "skip renewal for"); + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* for OSS capability, only renew those whose inode is + * opened. + */ + if (capa_for_oss(&ocapa->c_capa) && + obd_capa_open_count(ocapa) == 0) { + /* oss capa with open count == 0 won't renew, + * move to idle list */ + sort_add_capa(ocapa, &ll_idle_capas); + continue; + } + + /* NB iput() is in ll_update_capa() */ + inode = igrab(ocapa->u.cli.inode); + if (inode == NULL) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "igrab failed for"); + continue; + } + + capa_get(ocapa); + ll_capa_renewed++; + spin_unlock(&capa_lock); + rc = md_renew_capa(ll_i2mdexp(inode), ocapa, + ll_update_capa); + spin_lock(&capa_lock); + if (rc) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renew failed: %d", rc); + ll_capa_renewal_failed++; + } + } + + if (next) + update_capa_timer(next, capa_renewal_time(next)); + + list_for_each_entry_safe(ocapa, tmp, &ll_idle_capas, + c_list) { + if (!capa_is_expired(ocapa)) { + if (!next) + update_capa_timer(ocapa, + ocapa->c_expiry); + break; + } + + if (atomic_read(&ocapa->c_refc) > 1) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "expired(c_refc %d), don't release", + atomic_read(&ocapa->c_refc)); + /* don't try to renew any more */ + list_del_init(&ocapa->c_list); + continue; + } + + /* expired capa is released. */ + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "release expired"); + ll_delete_capa(ocapa); + } + + spin_unlock(&capa_lock); + } + + thread_set_flags(&ll_capa_thread, SVC_STOPPED); + wake_up(&ll_capa_thread.t_ctl_waitq); + RETURN(0); +} + +void ll_capa_timer_callback(unsigned long unused) +{ + wake_up(&ll_capa_thread.t_ctl_waitq); +} + +int ll_capa_thread_start(void) +{ + task_t *task; + ENTRY; + + init_waitqueue_head(&ll_capa_thread.t_ctl_waitq); + + task = kthread_run(capa_thread_main, NULL, "ll_capa"); + if (IS_ERR(task)) { + CERROR("cannot start expired capa thread: rc %ld\n", + PTR_ERR(task)); + RETURN(PTR_ERR(task)); + } + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_running(&ll_capa_thread)); + + RETURN(0); +} + +void ll_capa_thread_stop(void) +{ + thread_set_flags(&ll_capa_thread, SVC_STOPPING); + wake_up(&ll_capa_thread.t_ctl_waitq); + wait_event(ll_capa_thread.t_ctl_waitq, + thread_is_stopped(&ll_capa_thread)); +} + +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + int found = 0; + + ENTRY; + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0) + RETURN(NULL); + + LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW || + opc == CAPA_OPC_OSS_TRUNC); + + spin_lock(&capa_lock); + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if (capa_is_expired(ocapa)) + continue; + if ((opc & CAPA_OPC_OSS_WRITE) && + capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_READ) && + capa_opc_supported(&ocapa->c_capa, + CAPA_OPC_OSS_READ)) { + found = 1; + break; + } else if ((opc & CAPA_OPC_OSS_TRUNC) && + capa_opc_supported(&ocapa->c_capa, opc)) { + found = 1; + break; + } + } + + if (found) { + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + capa_get(ocapa); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + } else { + ocapa = NULL; + + if (atomic_read(&ll_capa_debug)) { + CERROR("no capability for "DFID" opc "LPX64"\n", + PFID(&lli->lli_fid), opc); + atomic_set(&ll_capa_debug, 0); + } + } + spin_unlock(&capa_lock); + + RETURN(ocapa); +} +EXPORT_SYMBOL(ll_osscapa_get); + +struct obd_capa *ll_mdscapa_get(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + ENTRY; + + LASSERT(inode != NULL); + + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0) + RETURN(NULL); + + spin_lock(&capa_lock); + ocapa = capa_get(lli->lli_mds_capa); + spin_unlock(&capa_lock); + if (!ocapa && atomic_read(&ll_capa_debug)) { + CERROR("no mds capability for "DFID"\n", PFID(&lli->lli_fid)); + atomic_set(&ll_capa_debug, 0); + } + + RETURN(ocapa); +} + +static struct obd_capa *do_add_mds_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *old = lli->lli_mds_capa; + struct lustre_capa *capa = &ocapa->c_capa; + + if (!old) { + ocapa->u.cli.inode = inode; + lli->lli_mds_capa = ocapa; + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add MDS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update MDS"); + + capa_put(ocapa); + ocapa = old; + } + return ocapa; +} + +static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + /* inside capa_lock */ + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if ((capa_opc(&ocapa->c_capa) & opc) != opc) + continue; + + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + return ocapa; + } + + return NULL; +} + +static inline void inode_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *tmp; + struct list_head *next = NULL; + + /* capa is sorted in lli_oss_capas so lookup can always find the + * latest one */ + list_for_each_entry(tmp, &lli->lli_oss_capas, u.cli.lli_list) { + if (cfs_time_after(ocapa->c_expiry, tmp->c_expiry)) { + next = &tmp->u.cli.lli_list; + break; + } + } + LASSERT(&ocapa->u.cli.lli_list != next); + list_move_tail(&ocapa->u.cli.lli_list, next ?: &lli->lli_oss_capas); +} + +static struct obd_capa *do_add_oss_capa(struct inode *inode, + struct obd_capa *ocapa) +{ + struct obd_capa *old; + struct lustre_capa *capa = &ocapa->c_capa; + + LASSERTF(S_ISREG(inode->i_mode), + "inode has oss capa, but not regular file, mode: %d\n", + inode->i_mode); + + /* FIXME: can't replace it so easily with fine-grained opc */ + old = do_lookup_oss_capa(inode, capa_opc(capa) & CAPA_OPC_OSS_ONLY); + if (!old) { + ocapa->u.cli.inode = inode; + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + capa_count[CAPA_SITE_CLIENT]++; + + DEBUG_CAPA(D_SEC, capa, "add OSS"); + } else { + spin_lock(&old->c_lock); + old->c_capa = *capa; + spin_unlock(&old->c_lock); + + DEBUG_CAPA(D_SEC, capa, "update OSS"); + + capa_put(ocapa); + ocapa = old; + } + + inode_add_oss_capa(inode, ocapa); + return ocapa; +} + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa) +{ + spin_lock(&capa_lock); + ocapa = capa_for_mds(&ocapa->c_capa) ? do_add_mds_capa(inode, ocapa) : + do_add_oss_capa(inode, ocapa); + + /* truncate capa won't renew */ + if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) { + set_capa_expiry(ocapa); + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + } + + spin_unlock(&capa_lock); + + atomic_set(&ll_capa_debug, 1); + return ocapa; +} + +static inline void delay_capa_renew(struct obd_capa *oc, cfs_time_t delay) +{ + /* NB: set a fake expiry for this capa to prevent it renew too soon */ + oc->c_expiry = cfs_time_add(oc->c_expiry, cfs_time_seconds(delay)); +} + +int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa) +{ + struct inode *inode = ocapa->u.cli.inode; + int rc = 0; + ENTRY; + + LASSERT(ocapa); + + if (IS_ERR(capa)) { + /* set error code */ + rc = PTR_ERR(capa); + spin_lock(&capa_lock); + if (rc == -ENOENT) { + DEBUG_CAPA(D_SEC, &ocapa->c_capa, + "renewal canceled because object removed"); + ll_capa_renewal_noent++; + } else { + ll_capa_renewal_failed++; + + /* failed capa won't be renewed any longer, but if -EIO, + * client might be doing recovery, retry in 2 min. */ + if (rc == -EIO && !capa_is_expired(ocapa)) { + delay_capa_renew(ocapa, 120); + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed: -EIO, " + "retry in 2 mins"); + ll_capa_renewal_retries++; + GOTO(retry, rc); + } else { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, + "renewal failed(rc: %d) for", rc); + } + } + + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, &ll_idle_capas); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + RETURN(rc); + } + + spin_lock(&ocapa->c_lock); + LASSERT(!memcmp(&ocapa->c_capa, capa, + offsetof(struct lustre_capa, lc_opc))); + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + spin_unlock(&ocapa->c_lock); + + spin_lock(&capa_lock); + if (capa_for_oss(capa)) + inode_add_oss_capa(inode, ocapa); + DEBUG_CAPA(D_SEC, capa, "renew"); + EXIT; +retry: + list_del_init(&ocapa->c_list); + sort_add_capa(ocapa, ll_capa_list); + update_capa_timer(ocapa, capa_renewal_time(ocapa)); + spin_unlock(&capa_lock); + + capa_put(ocapa); + iput(inode); + return rc; +} + +void ll_capa_open(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_inc(&lli->lli_open_count); +} + +void ll_capa_close(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + + if ((ll_i2sbi(inode)->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + == 0) + return; + + if (!S_ISREG(inode->i_mode)) + return; + + atomic_dec(&lli->lli_open_count); +} + +/* delete CAPA_OPC_OSS_TRUNC only */ +void ll_truncate_free_capa(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC); + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate"); + + /* release ref when find */ + capa_put(ocapa); + if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) { + spin_lock(&capa_lock); + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); + } +} + +void ll_clear_inode_capas(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa, *tmp; + + spin_lock(&capa_lock); + ocapa = lli->lli_mds_capa; + if (ocapa) + ll_delete_capa(ocapa); + + list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas, + u.cli.lli_list) + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); +} + +void ll_print_capa_stat(struct ll_sb_info *sbi) +{ + if (sbi->ll_flags & (LL_SBI_MDS_CAPA | LL_SBI_OSS_CAPA)) + LCONSOLE_INFO("Fid capabilities renewed: %llu\n" + "Fid capabilities renewal ENOENT: %llu\n" + "Fid capabilities failed to renew: %llu\n" + "Fid capabilities renewal retries: %llu\n", + ll_capa_renewed, ll_capa_renewal_noent, + ll_capa_renewal_failed, ll_capa_renewal_retries); +} diff --git a/drivers/staging/lustre/lustre/llite/llite_close.c b/drivers/staging/lustre/lustre/llite/llite_close.c new file mode 100644 index 000000000000..00b2b38d4c97 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_close.c @@ -0,0 +1,412 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_close.c + * + * Lustre Lite routines to issue a secondary close after writeback + */ + +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" + +/** records that a write is in flight */ +void vvp_write_pending(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + + ENTRY; + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_SOM_DIRTY; + if (page != NULL && list_empty(&page->cpg_pending_linkage)) + list_add(&page->cpg_pending_linkage, + &club->cob_pending_list); + spin_unlock(&lli->lli_lock); + EXIT; +} + +/** records that a write has completed */ +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page) +{ + struct ll_inode_info *lli = ll_i2info(club->cob_inode); + int rc = 0; + + ENTRY; + spin_lock(&lli->lli_lock); + if (page != NULL && !list_empty(&page->cpg_pending_linkage)) { + list_del_init(&page->cpg_pending_linkage); + rc = 1; + } + spin_unlock(&lli->lli_lock); + if (rc) + ll_queue_done_writing(club->cob_inode, 0); + EXIT; +} + +/** Queues DONE_WRITING if + * - done writing is allowed; + * - inode has no no dirty pages; */ +void ll_queue_done_writing(struct inode *inode, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + ENTRY; + + spin_lock(&lli->lli_lock); + lli->lli_flags |= flags; + + if ((lli->lli_flags & LLIF_DONE_WRITING) && + list_empty(&club->cob_pending_list)) { + struct ll_close_queue *lcq = ll_i2sbi(inode)->ll_lcq; + + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CWARN("ino %lu/%u(flags %u) som valid it just after " + "recovery\n", + inode->i_ino, inode->i_generation, + lli->lli_flags); + /* DONE_WRITING is allowed and inode has no dirty page. */ + spin_lock(&lcq->lcq_lock); + + LASSERT(list_empty(&lli->lli_close_list)); + CDEBUG(D_INODE, "adding inode %lu/%u to close list\n", + inode->i_ino, inode->i_generation); + list_add_tail(&lli->lli_close_list, &lcq->lcq_head); + + /* Avoid a concurrent insertion into the close thread queue: + * an inode is already in the close thread, open(), write(), + * close() happen, epoch is closed as the inode is marked as + * LLIF_EPOCH_PENDING. When pages are written inode should not + * be inserted into the queue again, clear this flag to avoid + * it. */ + lli->lli_flags &= ~LLIF_DONE_WRITING; + + wake_up(&lcq->lcq_waitq); + spin_unlock(&lcq->lcq_lock); + } + spin_unlock(&lli->lli_lock); + EXIT; +} + +/** Pack SOM attributes info @opdata for CLOSE, DONE_WRITING rpc. */ +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + op_data->op_flags |= MF_SOM_CHANGE; + /* Check if Size-on-MDS attributes are valid. */ + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after " + "recovery\n", inode->i_ino, inode->i_generation, + lli->lli_flags); + + if (!cl_local_size(inode)) { + /* Send Size-on-MDS Attributes if valid. */ + op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET | + ATTR_ATIME_SET | ATTR_SIZE | ATTR_BLOCKS; + } + EXIT; +} + +/** Closes ioepoch and packs Size-on-MDS attribute if needed into @op_data. */ +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_object *club = cl2ccc(ll_i2info(inode)->lli_clob); + ENTRY; + + spin_lock(&lli->lli_lock); + if (!(list_empty(&club->cob_pending_list))) { + if (!(lli->lli_flags & LLIF_EPOCH_PENDING)) { + LASSERT(*och != NULL); + LASSERT(lli->lli_pending_och == NULL); + /* Inode is dirty and there is no pending write done + * request yet, DONE_WRITE is to be sent later. */ + lli->lli_flags |= LLIF_EPOCH_PENDING; + lli->lli_pending_och = *och; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + GOTO(out, 0); + } + if (flags & LLIF_DONE_WRITING) { + /* Some pages are still dirty, it is early to send + * DONE_WRITE. Wait untill all pages will be flushed + * and try DONE_WRITE again later. */ + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + lli->lli_flags |= LLIF_DONE_WRITING; + spin_unlock(&lli->lli_lock); + + inode = igrab(inode); + LASSERT(inode); + GOTO(out, 0); + } + } + CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID"\n", + ll_i2info(inode)->lli_ioepoch, PFID(&lli->lli_fid)); + op_data->op_flags |= MF_EPOCH_CLOSE; + + if (flags & LLIF_DONE_WRITING) { + LASSERT(lli->lli_flags & LLIF_SOM_DIRTY); + LASSERT(!(lli->lli_flags & LLIF_DONE_WRITING)); + *och = lli->lli_pending_och; + lli->lli_pending_och = NULL; + lli->lli_flags &= ~LLIF_EPOCH_PENDING; + } else { + /* Pack Size-on-MDS inode attributes only if they has changed */ + if (!(lli->lli_flags & LLIF_SOM_DIRTY)) { + spin_unlock(&lli->lli_lock); + GOTO(out, 0); + } + + /* There is a pending DONE_WRITE -- close epoch with no + * attribute change. */ + if (lli->lli_flags & LLIF_EPOCH_PENDING) { + spin_unlock(&lli->lli_lock); + GOTO(out, 0); + } + } + + LASSERT(list_empty(&club->cob_pending_list)); + lli->lli_flags &= ~LLIF_SOM_DIRTY; + spin_unlock(&lli->lli_lock); + ll_done_writing_attr(inode, op_data); + + EXIT; +out: + return; +} + +/** + * Cliens updates SOM attributes on MDS (including llog cookies): + * obd_getattr with no lock and md_setattr. + */ +int ll_som_update(struct inode *inode, struct md_op_data *op_data) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request *request = NULL; + __u32 old_flags; + struct obdo *oa; + int rc; + ENTRY; + + LASSERT(op_data != NULL); + if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) + CERROR("ino %lu/%u(flags %u) som valid it just after " + "recovery\n", inode->i_ino, inode->i_generation, + lli->lli_flags); + + OBDO_ALLOC(oa); + if (!oa) { + CERROR("can't allocate memory for Size-on-MDS update.\n"); + RETURN(-ENOMEM); + } + + old_flags = op_data->op_flags; + op_data->op_flags = MF_SOM_CHANGE; + + /* If inode is already in another epoch, skip getattr from OSTs. */ + if (lli->lli_ioepoch == op_data->op_ioepoch) { + rc = ll_inode_getattr(inode, oa, op_data->op_ioepoch, + old_flags & MF_GETATTR_LOCK); + if (rc) { + oa->o_valid = 0; + if (rc != -ENOENT) + CERROR("inode_getattr failed (%d): unable to " + "send a Size-on-MDS attribute update " + "for inode %lu/%u\n", rc, inode->i_ino, + inode->i_generation); + } else { + CDEBUG(D_INODE, "Size-on-MDS update on "DFID"\n", + PFID(&lli->lli_fid)); + } + /* Install attributes into op_data. */ + md_from_obdo(op_data, oa, oa->o_valid); + } + + rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, + NULL, 0, NULL, 0, &request, NULL); + ptlrpc_req_finished(request); + + OBDO_FREE(oa); + RETURN(rc); +} + +/** + * Closes the ioepoch and packs all the attributes into @op_data for + * DONE_WRITING rpc. + */ +static void ll_prepare_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct obd_client_handle **och) +{ + ll_ioepoch_close(inode, op_data, och, LLIF_DONE_WRITING); + /* If there is no @och, we do not do D_W yet. */ + if (*och == NULL) + return; + + ll_pack_inode2opdata(inode, op_data, &(*och)->och_fh); + ll_prep_md_op_data(op_data, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, NULL); +} + +/** Send a DONE_WRITING rpc. */ +static void ll_done_writing(struct inode *inode) +{ + struct obd_client_handle *och = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + LASSERT(exp_connect_som(ll_i2mdexp(inode))); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) { + CERROR("can't allocate op_data\n"); + EXIT; + return; + } + + ll_prepare_done_writing(inode, op_data, &och); + /* If there is no @och, we do not do D_W yet. */ + if (och == NULL) + GOTO(out, 0); + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, NULL); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute from + * OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc done_writing failed: rc = %d\n", + inode->i_ino, rc); + } +out: + ll_finish_md_op_data(op_data); + if (och) { + md_clear_open_replay_data(ll_i2sbi(inode)->ll_md_exp, och); + OBD_FREE_PTR(och); + } + EXIT; +} + +static struct ll_inode_info *ll_close_next_lli(struct ll_close_queue *lcq) +{ + struct ll_inode_info *lli = NULL; + + spin_lock(&lcq->lcq_lock); + + if (!list_empty(&lcq->lcq_head)) { + lli = list_entry(lcq->lcq_head.next, struct ll_inode_info, + lli_close_list); + list_del_init(&lli->lli_close_list); + } else if (atomic_read(&lcq->lcq_stop)) + lli = ERR_PTR(-EALREADY); + + spin_unlock(&lcq->lcq_lock); + return lli; +} + +static int ll_close_thread(void *arg) +{ + struct ll_close_queue *lcq = arg; + ENTRY; + + complete(&lcq->lcq_comp); + + while (1) { + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *lli; + struct inode *inode; + + l_wait_event_exclusive(lcq->lcq_waitq, + (lli = ll_close_next_lli(lcq)) != NULL, + &lwi); + if (IS_ERR(lli)) + break; + + inode = ll_info2i(lli); + CDEBUG(D_INFO, "done_writting for inode %lu/%u\n", + inode->i_ino, inode->i_generation); + ll_done_writing(inode); + iput(inode); + } + + CDEBUG(D_INFO, "ll_close exiting\n"); + complete(&lcq->lcq_comp); + RETURN(0); +} + +int ll_close_thread_start(struct ll_close_queue **lcq_ret) +{ + struct ll_close_queue *lcq; + task_t *task; + + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CLOSE_THREAD)) + return -EINTR; + + OBD_ALLOC(lcq, sizeof(*lcq)); + if (lcq == NULL) + return -ENOMEM; + + spin_lock_init(&lcq->lcq_lock); + INIT_LIST_HEAD(&lcq->lcq_head); + init_waitqueue_head(&lcq->lcq_waitq); + init_completion(&lcq->lcq_comp); + + task = kthread_run(ll_close_thread, lcq, "ll_close"); + if (IS_ERR(task)) { + OBD_FREE(lcq, sizeof(*lcq)); + return PTR_ERR(task); + } + + wait_for_completion(&lcq->lcq_comp); + *lcq_ret = lcq; + return 0; +} + +void ll_close_thread_shutdown(struct ll_close_queue *lcq) +{ + init_completion(&lcq->lcq_comp); + atomic_inc(&lcq->lcq_stop); + wake_up(&lcq->lcq_waitq); + wait_for_completion(&lcq->lcq_comp); + OBD_FREE(lcq, sizeof(*lcq)); +} diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h new file mode 100644 index 000000000000..177b4dbd1c67 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -0,0 +1,1578 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LLITE_INTERNAL_H +#define LLITE_INTERNAL_H +#include +#include +#include /* for s2sbi */ +#include + +/* for struct cl_lock_descr and struct cl_io */ +#include +#include +#include +#include + +#ifndef FMODE_EXEC +#define FMODE_EXEC 0 +#endif + +#ifndef VM_FAULT_RETRY +#define VM_FAULT_RETRY 0 +#endif + +/* Kernel 3.1 kills LOOKUP_CONTINUE, LOOKUP_PARENT is equivalent to it. + * seem kernel commit 49084c3bb2055c401f3493c13edae14d49128ca0 */ +#ifndef LOOKUP_CONTINUE +#define LOOKUP_CONTINUE LOOKUP_PARENT +#endif + +/** Only used on client-side for indicating the tail of dir hash/offset. */ +#define LL_DIR_END_OFF 0x7fffffffffffffffULL +#define LL_DIR_END_OFF_32BIT 0x7fffffffUL + +#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") +#define LUSTRE_FPRIVATE(file) ((file)->private_data) + +struct ll_dentry_data { + int lld_cwd_count; + int lld_mnt_count; + struct obd_client_handle lld_cwd_och; + struct obd_client_handle lld_mnt_och; + struct lookup_intent *lld_it; + unsigned int lld_sa_generation; + unsigned int lld_invalid:1; + struct rcu_head lld_rcu_head; +}; + +#define ll_d2d(de) ((struct ll_dentry_data*)((de)->d_fsdata)) + +extern struct file_operations ll_pgcache_seq_fops; + +#define LLI_INODE_MAGIC 0x111d0de5 +#define LLI_INODE_DEAD 0xdeadd00d + +/* remote client permission cache */ +#define REMOTE_PERM_HASHSIZE 16 + +struct ll_getname_data { + char *lgd_name; /* points to a buffer with NAME_MAX+1 size */ + struct lu_fid lgd_fid; /* target fid we are looking for */ + int lgd_found; /* inode matched? */ +}; + +/* llite setxid/access permission for user on remote client */ +struct ll_remote_perm { + struct hlist_node lrp_list; + uid_t lrp_uid; + gid_t lrp_gid; + uid_t lrp_fsuid; + gid_t lrp_fsgid; + int lrp_access_perm; /* MAY_READ/WRITE/EXEC, this + is access permission with + lrp_fsuid/lrp_fsgid. */ +}; + +enum lli_flags { + /* MDS has an authority for the Size-on-MDS attributes. */ + LLIF_MDS_SIZE_LOCK = (1 << 0), + /* Epoch close is postponed. */ + LLIF_EPOCH_PENDING = (1 << 1), + /* DONE WRITING is allowed. */ + LLIF_DONE_WRITING = (1 << 2), + /* Sizeon-on-MDS attributes are changed. An attribute update needs to + * be sent to MDS. */ + LLIF_SOM_DIRTY = (1 << 3), + /* File is contented */ + LLIF_CONTENDED = (1 << 4), + /* Truncate uses server lock for this file */ + LLIF_SRVLOCK = (1 << 5), + /* File data is modified. */ + LLIF_DATA_MODIFIED = (1 << 6), +}; + +struct ll_inode_info { + __u32 lli_inode_magic; + __u32 lli_flags; + __u64 lli_ioepoch; + + spinlock_t lli_lock; + struct posix_acl *lli_posix_acl; + + struct hlist_head *lli_remote_perms; + struct mutex lli_rmtperm_mutex; + + /* identifying fields for both metadata and data stacks. */ + struct lu_fid lli_fid; + /* Parent fid for accessing default stripe data on parent directory + * for allocating OST objects after a mknod() and later open-by-FID. */ + struct lu_fid lli_pfid; + + struct list_head lli_close_list; + struct list_head lli_oss_capas; + /* open count currently used by capability only, indicate whether + * capability needs renewal */ + atomic_t lli_open_count; + struct obd_capa *lli_mds_capa; + cfs_time_t lli_rmtperm_time; + + /* handle is to be sent to MDS later on done_writing and setattr. + * Open handle data are needed for the recovery to reconstruct + * the inode state on the MDS. XXX: recovery is not ready yet. */ + struct obd_client_handle *lli_pending_och; + + /* We need all three because every inode may be opened in different + * modes */ + struct obd_client_handle *lli_mds_read_och; + struct obd_client_handle *lli_mds_write_och; + struct obd_client_handle *lli_mds_exec_och; + __u64 lli_open_fd_read_count; + __u64 lli_open_fd_write_count; + __u64 lli_open_fd_exec_count; + /* Protects access to och pointers and their usage counters */ + struct mutex lli_och_mutex; + + struct inode lli_vfs_inode; + + /* the most recent timestamps obtained from mds */ + struct ost_lvb lli_lvb; + spinlock_t lli_agl_lock; + + /* Try to make the d::member and f::member are aligned. Before using + * these members, make clear whether it is directory or not. */ + union { + /* for directory */ + struct { + /* serialize normal readdir and statahead-readdir. */ + struct mutex d_readdir_mutex; + + /* metadata statahead */ + /* since parent-child threads can share the same @file + * struct, "opendir_key" is the token when dir close for + * case of parent exit before child -- it is me should + * cleanup the dir readahead. */ + void *d_opendir_key; + struct ll_statahead_info *d_sai; + struct posix_acl *d_def_acl; + /* protect statahead stuff. */ + spinlock_t d_sa_lock; + /* "opendir_pid" is the token when lookup/revalid + * -- I am the owner of dir statahead. */ + pid_t d_opendir_pid; + } d; + +#define lli_readdir_mutex u.d.d_readdir_mutex +#define lli_opendir_key u.d.d_opendir_key +#define lli_sai u.d.d_sai +#define lli_def_acl u.d.d_def_acl +#define lli_sa_lock u.d.d_sa_lock +#define lli_opendir_pid u.d.d_opendir_pid + + /* for non-directory */ + struct { + struct semaphore f_size_sem; + void *f_size_sem_owner; + char *f_symlink_name; + __u64 f_maxbytes; + /* + * struct rw_semaphore { + * signed long count; // align d.d_def_acl + * spinlock_t wait_lock; // align d.d_sa_lock + * struct list_head wait_list; + * } + */ + struct rw_semaphore f_trunc_sem; + struct mutex f_write_mutex; + + struct rw_semaphore f_glimpse_sem; + cfs_time_t f_glimpse_time; + struct list_head f_agl_list; + __u64 f_agl_index; + + /* for writepage() only to communicate to fsync */ + int f_async_rc; + + /* volatile file criteria is based on file name, this + * flag is used to keep the test result, so the strcmp + * is done only once + */ + bool f_volatile; + /* + * whenever a process try to read/write the file, the + * jobid of the process will be saved here, and it'll + * be packed into the write PRC when flush later. + * + * so the read/write statistics for jobid will not be + * accurate if the file is shared by different jobs. + */ + char f_jobid[JOBSTATS_JOBID_SIZE]; + } f; + +#define lli_size_sem u.f.f_size_sem +#define lli_size_sem_owner u.f.f_size_sem_owner +#define lli_symlink_name u.f.f_symlink_name +#define lli_maxbytes u.f.f_maxbytes +#define lli_trunc_sem u.f.f_trunc_sem +#define lli_write_mutex u.f.f_write_mutex +#define lli_glimpse_sem u.f.f_glimpse_sem +#define lli_glimpse_time u.f.f_glimpse_time +#define lli_agl_list u.f.f_agl_list +#define lli_agl_index u.f.f_agl_index +#define lli_async_rc u.f.f_async_rc +#define lli_jobid u.f.f_jobid +#define lli_volatile u.f.f_volatile + + } u; + + /* XXX: For following frequent used members, although they maybe special + * used for non-directory object, it is some time-wasting to check + * whether the object is directory or not before using them. On the + * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce + * the "ll_inode_info" size even if moving those members into u.f. + * So keep them out side. + * + * In the future, if more members are added only for directory, + * some of the following members can be moved into u.f. + */ + bool lli_has_smd; + struct cl_object *lli_clob; + + /* mutex to request for layout lock exclusively. */ + struct mutex lli_layout_mutex; + /* valid only inside LAYOUT ibits lock, protected by lli_layout_mutex */ + __u32 lli_layout_gen; +}; + +/* + * Locking to guarantee consistency of non-atomic updates to long long i_size, + * consistency between file size and KMS. + * + * Implemented by ->lli_size_sem and ->lsm_lock, nested in that order. + */ + +void ll_inode_size_lock(struct inode *inode); +void ll_inode_size_unlock(struct inode *inode); + +// FIXME: replace the name of this with LL_I to conform to kernel stuff +// static inline struct ll_inode_info *LL_I(struct inode *inode) +static inline struct ll_inode_info *ll_i2info(struct inode *inode) +{ + return container_of(inode, struct ll_inode_info, lli_vfs_inode); +} + +/* default to about 40meg of readahead on a given system. That much tied + * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */ +#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT)) + +/* default to read-ahead full files smaller than 2MB on the second read */ +#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT)) + +enum ra_stat { + RA_STAT_HIT = 0, + RA_STAT_MISS, + RA_STAT_DISTANT_READPAGE, + RA_STAT_MISS_IN_WINDOW, + RA_STAT_FAILED_GRAB_PAGE, + RA_STAT_FAILED_MATCH, + RA_STAT_DISCARDED, + RA_STAT_ZERO_LEN, + RA_STAT_ZERO_WINDOW, + RA_STAT_EOF, + RA_STAT_MAX_IN_FLIGHT, + RA_STAT_WRONG_GRAB_PAGE, + _NR_RA_STAT, +}; + +struct ll_ra_info { + atomic_t ra_cur_pages; + unsigned long ra_max_pages; + unsigned long ra_max_pages_per_file; + unsigned long ra_max_read_ahead_whole_pages; +}; + +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + +/* LL_HIST_MAX=32 causes an overflow */ +#define LL_HIST_MAX 28 +#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ +#define LL_PROCESS_HIST_MAX 10 +struct per_process_info { + pid_t pid; + struct obd_histogram pp_r_hist; + struct obd_histogram pp_w_hist; +}; + +/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ +struct ll_rw_extents_info { + struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; +}; + +#define LL_OFFSET_HIST_MAX 100 +struct ll_rw_process_info { + pid_t rw_pid; + int rw_op; + loff_t rw_range_start; + loff_t rw_range_end; + loff_t rw_last_file_pos; + loff_t rw_offset; + size_t rw_smallest_extent; + size_t rw_largest_extent; + struct ll_file_data *rw_last_file; +}; + +enum stats_track_type { + STATS_TRACK_ALL = 0, /* track all processes */ + STATS_TRACK_PID, /* track process with this pid */ + STATS_TRACK_PPID, /* track processes with this ppid */ + STATS_TRACK_GID, /* track processes with this gid */ + STATS_TRACK_LAST, +}; + +/* flags for sbi->ll_flags */ +#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ +#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ +#define LL_SBI_FLOCK 0x04 +#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ +#define LL_SBI_ACL 0x10 /* support ACL */ +#define LL_SBI_RMT_CLIENT 0x40 /* remote client */ +#define LL_SBI_MDS_CAPA 0x80 /* support mds capa */ +#define LL_SBI_OSS_CAPA 0x100 /* support oss capa */ +#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ +#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ +#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ +#define LL_SBI_SOM_PREVIEW 0x1000 /* SOM preview mount option */ +#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. */ +#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ +#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ +#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ +#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ +#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ + +#define LL_SBI_FLAGS { \ + "nolck", \ + "checksum", \ + "flock", \ + "xattr", \ + "acl", \ + "rmt_client", \ + "mds_capa", \ + "oss_capa", \ + "flock", \ + "lru_resize", \ + "lazy_statfs", \ + "som", \ + "32bit_api", \ + "64bit_hash", \ + "agl", \ + "verbose", \ + "layout", \ + "user_fid2path" } + +/* default value for ll_sb_info->contention_time */ +#define SBI_DEFAULT_CONTENTION_SECONDS 60 +/* default value for lockless_truncate_enable */ +#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1 +#define RCE_HASHES 32 + +struct rmtacl_ctl_entry { + struct list_head rce_list; + pid_t rce_key; /* hash key */ + int rce_ops; /* acl operation type */ +}; + +struct rmtacl_ctl_table { + spinlock_t rct_lock; + struct list_head rct_entries[RCE_HASHES]; +}; + +#define EE_HASHES 32 + +struct eacl_entry { + struct list_head ee_list; + pid_t ee_key; /* hash key */ + struct lu_fid ee_fid; + int ee_type; /* ACL type for ACCESS or DEFAULT */ + ext_acl_xattr_header *ee_acl; +}; + +struct eacl_table { + spinlock_t et_lock; + struct list_head et_entries[EE_HASHES]; +}; + +struct ll_sb_info { + struct list_head ll_list; + /* this protects pglist and ra_info. It isn't safe to + * grab from interrupt contexts */ + spinlock_t ll_lock; + spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ + spinlock_t ll_process_lock; /* ll_rw_process_info */ + struct obd_uuid ll_sb_uuid; + struct obd_export *ll_md_exp; + struct obd_export *ll_dt_exp; + struct proc_dir_entry* ll_proc_root; + struct lu_fid ll_root_fid; /* root object fid */ + + int ll_flags; + struct list_head ll_conn_chain; /* per-conn chain of SBs */ + struct lustre_client_ocd ll_lco; + + struct list_head ll_orphan_dentry_list; /*please don't ask -p*/ + struct ll_close_queue *ll_lcq; + + struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ + + /* Used to track "unstable" pages on a client, and maintain a + * LRU list of clean pages. An "unstable" page is defined as + * any page which is sent to a server as part of a bulk request, + * but is uncommitted to stable storage. */ + struct cl_client_cache ll_cache; + + struct lprocfs_stats *ll_ra_stats; + + struct ll_ra_info ll_ra_info; + unsigned int ll_namelen; + struct file_operations *ll_fop; + + /* =0 - hold lock over whole read/write + * >0 - max. chunk to be read/written w/o lock re-acquiring */ + unsigned long ll_max_rw_chunk; + unsigned int ll_md_brw_size; /* used by readdir */ + + struct lu_site *ll_site; + struct cl_device *ll_cl; + /* Statistics */ + struct ll_rw_extents_info ll_rw_extents_info; + int ll_extent_process_count; + struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; + unsigned int ll_offset_process_count; + struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; + unsigned int ll_rw_offset_entry_count; + int ll_stats_track_id; + enum stats_track_type ll_stats_track_type; + int ll_rw_stats_on; + + /* metadata stat-ahead */ + unsigned int ll_sa_max; /* max statahead RPCs */ + atomic_t ll_sa_total; /* statahead thread started + * count */ + atomic_t ll_sa_wrong; /* statahead thread stopped for + * low hit ratio */ + atomic_t ll_agl_total; /* AGL thread started count */ + + dev_t ll_sdev_orig; /* save s_dev before assign for + * clustred nfs */ + struct rmtacl_ctl_table ll_rct; + struct eacl_table ll_et; +}; + +#define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) + +struct ll_ra_read { + pgoff_t lrr_start; + pgoff_t lrr_count; + struct task_struct *lrr_reader; + struct list_head lrr_linkage; +}; + +/* + * per file-descriptor read-ahead data. + */ +struct ll_readahead_state { + spinlock_t ras_lock; + /* + * index of the last page that read(2) needed and that wasn't in the + * cache. Used by ras_update() to detect seeks. + * + * XXX nikita: if access seeks into cached region, Lustre doesn't see + * this. + */ + unsigned long ras_last_readpage; + /* + * number of pages read after last read-ahead window reset. As window + * is reset on each seek, this is effectively a number of consecutive + * accesses. Maybe ->ras_accessed_in_window is better name. + * + * XXX nikita: window is also reset (by ras_update()) when Lustre + * believes that memory pressure evicts read-ahead pages. In that + * case, it probably doesn't make sense to expand window to + * PTLRPC_MAX_BRW_PAGES on the third access. + */ + unsigned long ras_consecutive_pages; + /* + * number of read requests after the last read-ahead window reset + * As window is reset on each seek, this is effectively the number + * on consecutive read request and is used to trigger read-ahead. + */ + unsigned long ras_consecutive_requests; + /* + * Parameters of current read-ahead window. Handled by + * ras_update(). On the initial access to the file or after a seek, + * window is reset to 0. After 3 consecutive accesses, window is + * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by + * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. + */ + unsigned long ras_window_start, ras_window_len; + /* + * Where next read-ahead should start at. This lies within read-ahead + * window. Read-ahead window is read in pieces rather than at once + * because: 1. lustre limits total number of pages under read-ahead by + * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages + * not covered by DLM lock. + */ + unsigned long ras_next_readahead; + /* + * Total number of ll_file_read requests issued, reads originating + * due to mmap are not counted in this total. This value is used to + * trigger full file read-ahead after multiple reads to a small file. + */ + unsigned long ras_requests; + /* + * Page index with respect to the current request, these value + * will not be accurate when dealing with reads issued via mmap. + */ + unsigned long ras_request_index; + /* + * list of struct ll_ra_read's one per read(2) call current in + * progress against this file descriptor. Used by read-ahead code, + * protected by ->ras_lock. + */ + struct list_head ras_read_beads; + /* + * The following 3 items are used for detecting the stride I/O + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... + * offset |-stride_pages-|-stride_gap-| + * ras_stride_offset = offset; + * ras_stride_length = stride_pages + stride_gap; + * ras_stride_pages = stride_pages; + * Note: all these three items are counted by pages. + */ + unsigned long ras_stride_length; + unsigned long ras_stride_pages; + pgoff_t ras_stride_offset; + /* + * number of consecutive stride request count, and it is similar as + * ras_consecutive_requests, but used for stride I/O mode. + * Note: only more than 2 consecutive stride request are detected, + * stride read-ahead will be enable + */ + unsigned long ras_consecutive_stride_requests; +}; + +extern struct kmem_cache *ll_file_data_slab; +struct lustre_handle; +struct ll_file_data { + struct ll_readahead_state fd_ras; + int fd_omode; + struct ccc_grouplock fd_grouplock; + __u64 lfd_pos; + __u32 fd_flags; + struct file *fd_file; + /* Indicate whether need to report failure when close. + * true: failure is known, not report again. + * false: unknown failure, should report. */ + bool fd_write_failed; +}; + +struct lov_stripe_md; + +extern spinlock_t inode_lock; + +extern struct proc_dir_entry *proc_lustre_fs_root; + +static inline struct inode *ll_info2i(struct ll_inode_info *lli) +{ + return &lli->lli_vfs_inode; +} + +struct it_cb_data { + struct inode *icbd_parent; + struct dentry **icbd_childp; + obd_id hash; +}; + +__u32 ll_i2suppgid(struct inode *i); +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + +static inline int ll_need_32bit_api(struct ll_sb_info *sbi) +{ +#if BITS_PER_LONG == 32 + return 1; +#else + return unlikely(current_is_32bit() || (sbi->ll_flags & LL_SBI_32BIT_API)); +#endif +} + +#define LLAP_MAGIC 98764321 + +extern struct kmem_cache *ll_async_page_slab; +extern size_t ll_async_page_slab_size; + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar); +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar); +struct ll_ra_read *ll_ra_read_get(struct file *f); + +/* llite/lproc_llite.c */ +#ifdef LPROCFS +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc); +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi); +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count); +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc){return 0;} +static inline void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) {} +static void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) {} +static void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + + +/* llite/dir.c */ +void ll_release_page(struct page *page, int remove); +extern struct file_operations ll_dir_operations; +extern struct inode_operations ll_dir_inode_operations; +struct page *ll_get_dir_page(struct inode *dir, __u64 hash, + struct ll_dir_chain *chain); +int ll_dir_read(struct inode *inode, __u64 *_pos, void *cookie, + filldir_t filldir); + +int ll_get_mdt_idx(struct inode *inode); +/* llite/namei.c */ +int ll_objects_destroy(struct ptlrpc_request *request, + struct inode *dir); +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *lic); +int ll_md_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); +int ll_rmdir_entry(struct inode *dir, char *name, int namelen); + +/* llite/rw.c */ +int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); +int ll_writepage(struct page *page, struct writeback_control *wbc); +int ll_writepages(struct address_space *, struct writeback_control *wbc); +void ll_removepage(struct page *page); +int ll_readpage(struct file *file, struct page *page); +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); +int ll_file_punch(struct inode *, loff_t, int); +ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int); +void ll_clear_file_contended(struct inode*); +int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t); +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags); + +/* llite/file.c */ +extern struct file_operations ll_file_operations; +extern struct file_operations ll_file_operations_flock; +extern struct file_operations ll_file_operations_noflock; +extern struct inode_operations ll_file_inode_operations; +extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *, + __u64); +extern int ll_have_md_lock(struct inode *inode, __u64 *bits, + ldlm_mode_t l_req_mode); +extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, + struct lustre_handle *lockh, __u64 flags); +int __ll_inode_revalidate_it(struct dentry *, struct lookup_intent *, + __u64 bits); +int ll_revalidate_nd(struct dentry *dentry, unsigned int flags); +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); +int ll_glimpse_ioctl(struct ll_sb_info *sbi, + struct lov_stripe_md *lsm, lstat_t *st); +void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch); +int ll_local_open(struct file *file, + struct lookup_intent *it, struct ll_file_data *fd, + struct obd_client_handle *och); +int ll_release_openhandle(struct dentry *, struct lookup_intent *); +int ll_md_close(struct obd_export *md_exp, struct inode *inode, + struct file *file); +int ll_md_real_close(struct inode *inode, int flags); +void ll_ioepoch_close(struct inode *inode, struct md_op_data *op_data, + struct obd_client_handle **och, unsigned long flags); +void ll_done_writing_attr(struct inode *inode, struct md_op_data *op_data); +int ll_som_update(struct inode *inode, struct md_op_data *op_data); +int ll_inode_getattr(struct inode *inode, struct obdo *obdo, + __u64 ioepoch, int sync); +int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, + struct md_open_data **mod); +void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, + struct lustre_handle *fh); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); +int ll_getattr_it(struct vfsmount *mnt, struct dentry *de, + struct lookup_intent *it, struct kstat *stat); +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); +struct ll_file_data *ll_file_data_get(void); +struct posix_acl * ll_get_acl(struct inode *inode, int type); + +int ll_inode_permission(struct inode *inode, int mask); + +int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, + int flags, struct lov_user_md *lum, + int lum_size); +int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, + struct lov_mds_md **lmm, int *lmm_size, + struct ptlrpc_request **request); +int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, + int set_default); +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, + int *lmm_size, struct ptlrpc_request **request); +int ll_fsync(struct file *file, loff_t start, loff_t end, int data); +int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, + int num_bytes); +int ll_merge_lvb(const struct lu_env *env, struct inode *inode); +int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg); +int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); +int ll_fid2path(struct inode *inode, void *arg); +int ll_data_version(struct inode *inode, __u64 *data_version, int extent_lock); + +/* llite/dcache.c */ + +int ll_dops_init(struct dentry *de, int block, int init_sa); +extern struct dentry_operations ll_d_ops; +void ll_intent_drop_lock(struct lookup_intent *); +void ll_intent_release(struct lookup_intent *); +void ll_invalidate_aliases(struct inode *); +void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft); +void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); +int ll_dcompare(const struct dentry *parent, const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *d_name); +int ll_revalidate_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, struct dentry *de); + +/* llite/llite_lib.c */ +extern struct super_operations lustre_super_operations; + +char *ll_read_opt(const char *opt, char *data); +void ll_lli_init(struct ll_inode_info *lli); +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt); +void ll_put_super(struct super_block *sb); +void ll_kill_super(struct super_block *sb); +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); +struct inode *ll_inode_from_lock(struct ldlm_lock *lock); +void ll_clear_inode(struct inode *inode); +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr); +int ll_setattr(struct dentry *de, struct iattr *attr); +int ll_statfs(struct dentry *de, struct kstatfs *sfs); +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags); +void ll_update_inode(struct inode *inode, struct lustre_md *md); +void ll_read_inode2(struct inode *inode, void *opaque); +void ll_delete_inode(struct inode *inode); +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +int ll_flush_ctx(struct inode *inode); +void ll_umount_begin(struct super_block *sb); +int ll_remount_fs(struct super_block *sb, int *flags, char *data); +int ll_show_options(struct seq_file *seq, struct dentry *dentry); +void ll_dirty_page_discard_warn(struct page *page, int ioret); +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *, struct lookup_intent *); +void lustre_dump_dentry(struct dentry *, int recur); +void lustre_dump_inode(struct inode *); +int ll_obd_statfs(struct inode *inode, void *arg); +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); +int ll_process_config(struct lustre_cfg *lcfg); +struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data); +void ll_finish_md_op_data(struct md_op_data *op_data); +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); + +/* llite/llite_nfs.c */ +extern struct export_operations lustre_export_operations; +__u32 get_uuid2int(const char *name, int len); +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid); + +/* llite/special.c */ +extern struct inode_operations ll_special_inode_operations; +extern struct file_operations ll_special_chr_inode_fops; +extern struct file_operations ll_special_chr_file_fops; +extern struct file_operations ll_special_blk_inode_fops; +extern struct file_operations ll_special_fifo_inode_fops; +extern struct file_operations ll_special_fifo_file_fops; +extern struct file_operations ll_special_sock_inode_fops; + +/* llite/symlink.c */ +extern struct inode_operations ll_fast_symlink_inode_operations; + +/* llite/llite_close.c */ +struct ll_close_queue { + spinlock_t lcq_lock; + struct list_head lcq_head; + wait_queue_head_t lcq_waitq; + struct completion lcq_comp; + atomic_t lcq_stop; +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode); + + +void vvp_write_pending (struct ccc_object *club, struct ccc_page *page); +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page); + +/* specific achitecture can implement only part of this list */ +enum vvp_io_subtype { + /** normal IO */ + IO_NORMAL, + /** io called from .sendfile */ + IO_SENDFILE, + /** io started from splice_{read|write} */ + IO_SPLICE +}; + +/* IO subtypes */ +struct vvp_io { + /** io subtype */ + enum vvp_io_subtype cui_io_subtype; + + union { + struct { + read_actor_t cui_actor; + void *cui_target; + } sendfile; + struct { + struct pipe_inode_info *cui_pipe; + unsigned int cui_flags; + } splice; + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * locked page returned from vvp_io + */ + struct page *ft_vmpage; + struct vm_fault_api { + /** + * kernel fault info + */ + struct vm_fault *ft_vmf; + /** + * fault API used bitflags for return code. + */ + unsigned int ft_flags; + } fault; + } fault; + } u; + /** + * Read-ahead state used by read and page-fault IO contexts. + */ + struct ll_ra_read cui_bead; + /** + * Set when cui_bead has been initialized. + */ + int cui_ra_window_set; + /** + * Partially truncated page, that vvp_io_trunc_start() keeps locked + * across truncate. + */ + struct cl_page *cui_partpage; +}; + +/** + * IO arguments for various VFS I/O interfaces. + */ +struct vvp_io_args { + /** normal/sendfile/splice */ + enum vvp_io_subtype via_io_subtype; + + union { + struct { + struct kiocb *via_iocb; + struct iovec *via_iov; + unsigned long via_nrsegs; + } normal; + struct { + read_actor_t via_actor; + void *via_target; + } sendfile; + struct { + struct pipe_inode_info *via_pipe; + unsigned int via_flags; + } splice; + } u; +}; + +struct ll_cl_context { + void *lcc_cookie; + struct cl_io *lcc_io; + struct cl_page *lcc_page; + struct lu_env *lcc_env; + int lcc_refcheck; + int lcc_created; +}; + +struct vvp_thread_info { + struct ost_lvb vti_lvb; + struct cl_2queue vti_queue; + struct iovec vti_local_iov; + struct vvp_io_args vti_args; + struct ra_io_arg vti_ria; + struct kiocb vti_kiocb; + struct ll_cl_context vti_io_ctx; +}; + +static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) +{ + extern struct lu_context_key vvp_key; + struct vvp_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &vvp_key); + LASSERT(info != NULL); + return info; +} + +static inline struct vvp_io_args *vvp_env_args(const struct lu_env *env, + enum vvp_io_subtype type) +{ + struct vvp_io_args *ret = &vvp_env_info(env)->vti_args; + + ret->via_io_subtype = type; + + return ret; +} + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + extern struct lu_context_key vvp_session_key; + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + +void ll_queue_done_writing(struct inode *inode, unsigned long flags); +void ll_close_thread_shutdown(struct ll_close_queue *lcq); +int ll_close_thread_start(struct ll_close_queue **lcq_ret); + +/* llite/llite_mmap.c */ +typedef struct rb_root rb_root_t; +typedef struct rb_node rb_node_t; + +struct ll_lock_tree_node; +struct ll_lock_tree { + rb_root_t lt_root; + struct list_head lt_locked_list; + struct ll_file_data *lt_fd; +}; + +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); +int ll_file_mmap(struct file * file, struct vm_area_struct * vma); +struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, + __u64 end, ldlm_mode_t mode); +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, size_t count); +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count); + +static inline void ll_invalidate_page(struct page *vmpage) +{ + struct address_space *mapping = vmpage->mapping; + loff_t offset = vmpage->index << PAGE_CACHE_SHIFT; + + LASSERT(PageLocked(vmpage)); + if (mapping == NULL) + return; + + ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE); + truncate_complete_page(mapping, vmpage); +} + +#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2dtexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_dt_exp; +} + +/* don't need an addref as the sb_info should be holding one */ +static inline struct obd_export *ll_s2mdexp(struct super_block *sb) +{ + return ll_s2sbi(sb)->ll_md_exp; +} + +static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) +{ + struct obd_device *obd = sbi->ll_md_exp->exp_obd; + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +// FIXME: replace the name of this with LL_SB to conform to kernel stuff +static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) +{ + return ll_s2sbi(inode->i_sb); +} + +static inline struct obd_export *ll_i2dtexp(struct inode *inode) +{ + return ll_s2dtexp(inode->i_sb); +} + +static inline struct obd_export *ll_i2mdexp(struct inode *inode) +{ + return ll_s2mdexp(inode->i_sb); +} + +static inline struct lu_fid *ll_inode2fid(struct inode *inode) +{ + struct lu_fid *fid; + + LASSERT(inode != NULL); + fid = &ll_i2info(inode)->lli_fid; + + return fid; +} + +static inline int ll_mds_max_easize(struct super_block *sb) +{ + return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize; +} + +static inline __u64 ll_file_maxbytes(struct inode *inode) +{ + return ll_i2info(inode)->lli_maxbytes; +} + +/* llite/xattr.c */ +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ll_removexattr(struct dentry *dentry, const char *name); + +/* llite/remote_perm.c */ +extern struct kmem_cache *ll_remote_perm_cachep; +extern struct kmem_cache *ll_rmtperm_hash_cachep; + +struct hlist_head *alloc_rmtperm_hash(void); +void free_rmtperm_hash(struct hlist_head *hash); +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm); +int lustre_check_remote_perm(struct inode *inode, int mask); + +/* llite/llite_capa.c */ +extern timer_list_t ll_capa_timer; + +int ll_capa_thread_start(void); +void ll_capa_thread_stop(void); +void ll_capa_timer_callback(unsigned long unused); + +struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa); +int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa); + +void ll_capa_open(struct inode *inode); +void ll_capa_close(struct inode *inode); + +struct obd_capa *ll_mdscapa_get(struct inode *inode); +struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc); + +void ll_truncate_free_capa(struct obd_capa *ocapa); +void ll_clear_inode_capas(struct inode *inode); +void ll_print_capa_stat(struct ll_sb_info *sbi); + +/* llite/llite_cl.c */ +extern struct lu_device_type vvp_device_type; + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); +enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma); +void ll_io_init(struct cl_io *io, const struct file *file, int write); + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit); +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +int ll_is_file_contended(struct file *file); +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which); + +/* llite/llite_rmtacl.c */ +#ifdef CONFIG_FS_POSIX_ACL +obd_valid rce_ops2valid(int ops); +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key); +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops); +int rct_del(struct rmtacl_ctl_table *rct, pid_t key); +void rct_init(struct rmtacl_ctl_table *rct); +void rct_fini(struct rmtacl_ctl_table *rct); + +void ee_free(struct eacl_entry *ee); +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header); +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type); +void et_search_free(struct eacl_table *et, pid_t key); +void et_init(struct eacl_table *et); +void et_fini(struct eacl_table *et); +#endif + +/* statahead.c */ + +#define LL_SA_RPC_MIN 2 +#define LL_SA_RPC_DEF 32 +#define LL_SA_RPC_MAX 8192 + +#define LL_SA_CACHE_BIT 5 +#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) +#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) + +/* per inode struct, for dir only */ +struct ll_statahead_info { + struct inode *sai_inode; + atomic_t sai_refcount; /* when access this struct, hold + * refcount */ + unsigned int sai_generation; /* generation for statahead */ + unsigned int sai_max; /* max ahead of lookup */ + __u64 sai_sent; /* stat requests sent count */ + __u64 sai_replied; /* stat requests which received + * reply */ + __u64 sai_index; /* index of statahead entry */ + __u64 sai_index_wait; /* index of entry which is the + * caller is waiting for */ + __u64 sai_hit; /* hit count */ + __u64 sai_miss; /* miss count: + * for "ls -al" case, it includes + * hidden dentry miss; + * for "ls -l" case, it does not + * include hidden dentry miss. + * "sai_miss_hidden" is used for + * the later case. + */ + unsigned int sai_consecutive_miss; /* consecutive miss */ + unsigned int sai_miss_hidden;/* "ls -al", but first dentry + * is not a hidden one */ + unsigned int sai_skip_hidden;/* skipped hidden dentry count */ + unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for + * hidden entries */ + sai_in_readpage:1,/* statahead is in readdir()*/ + sai_agl_valid:1;/* AGL is valid for the dir */ + wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ + struct ptlrpc_thread sai_thread; /* stat-ahead thread */ + struct ptlrpc_thread sai_agl_thread; /* AGL thread */ + struct list_head sai_entries; /* entry list */ + struct list_head sai_entries_received; /* entries returned */ + struct list_head sai_entries_stated; /* entries stated */ + struct list_head sai_entries_agl; /* AGL entries to be sent */ + struct list_head sai_cache[LL_SA_CACHE_SIZE]; + spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; + atomic_t sai_cache_count; /* entry count in cache */ +}; + +int do_statahead_enter(struct inode *dir, struct dentry **dentry, + int only_unplug); +void ll_stop_statahead(struct inode *dir, void *key); + +static inline int ll_glimpse_size(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + + down_read(&lli->lli_glimpse_sem); + rc = cl_glimpse_size(inode); + lli->lli_glimpse_time = cfs_time_current(); + up_read(&lli->lli_glimpse_sem); + return rc; +} + +static inline void +ll_statahead_mark(struct inode *dir, struct dentry *dentry) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct ll_dentry_data *ldd = ll_d2d(dentry); + + /* not the same process, don't mark */ + if (lli->lli_opendir_pid != current_pid()) + return; + + if (sai != NULL && ldd != NULL) + ldd->lld_sa_generation = sai->sai_generation; +} + +static inline int +ll_need_statahead(struct inode *dir, struct dentry *dentryp) +{ + struct ll_inode_info *lli; + struct ll_dentry_data *ldd; + + if (ll_i2sbi(dir)->ll_sa_max == 0) + return -EAGAIN; + + lli = ll_i2info(dir); + /* not the same process, don't statahead */ + if (lli->lli_opendir_pid != current_pid()) + return -EAGAIN; + + /* statahead has been stopped */ + if (lli->lli_opendir_key == NULL) + return -EAGAIN; + + ldd = ll_d2d(dentryp); + /* + * When stats a dentry, the system trigger more than once "revalidate" + * or "lookup", for "getattr", for "getxattr", and maybe for others. + * Under patchless client mode, the operation intent is not accurate, + * which maybe misguide the statahead thread. For example: + * The "revalidate" call for "getattr" and "getxattr" of a dentry maybe + * have the same operation intent -- "IT_GETATTR". + * In fact, one dentry should has only one chance to interact with the + * statahead thread, otherwise the statahead windows will be confused. + * The solution is as following: + * Assign "lld_sa_generation" with "sai_generation" when a dentry + * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" + * will bypass interacting with statahead thread for checking: + * "lld_sa_generation == lli_sai->sai_generation" + */ + if (ldd && lli->lli_sai && + ldd->lld_sa_generation == lli->lli_sai->sai_generation) + return -EAGAIN; + + return 1; +} + +static inline int +ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int only_unplug) +{ + int ret; + + ret = ll_need_statahead(dir, *dentryp); + if (ret <= 0) + return ret; + + return do_statahead_enter(dir, dentryp, only_unplug); +} + +/* llite ioctl register support rountine */ +enum llioc_iter { + LLIOC_CONT = 0, + LLIOC_STOP +}; + +#define LLIOC_MAX_CMD 256 + +/* + * Rules to write a callback function: + * + * Parameters: + * @magic: Dynamic ioctl call routine will feed this vaule with the pointer + * returned to ll_iocontrol_register. Callback functions should use this + * data to check the potential collasion of ioctl cmd. If collasion is + * found, callback function should return LLIOC_CONT. + * @rcp: The result of ioctl command. + * + * Return values: + * If @magic matches the pointer returned by ll_iocontrol_data, the + * callback should return LLIOC_STOP; return LLIOC_STOP otherwise. + */ +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg, + void *magic, int *rcp); + +enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg, int *rcp); + +/* export functions */ +/* Register ioctl block dynamatically for a regular file. + * + * @cmd: the array of ioctl command set + * @count: number of commands in the @cmd + * @cb: callback function, it will be called if an ioctl command is found to + * belong to the command list @cmd. + * + * Return vaule: + * A magic pointer will be returned if success; + * otherwise, NULL will be returned. + * */ +void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); +void ll_iocontrol_unregister(void *magic); + + +/* lclient compat stuff */ +#define cl_inode_info ll_inode_info +#define cl_i2info(info) ll_i2info(info) +#define cl_inode_mode(inode) ((inode)->i_mode) +#define cl_i2sbi ll_i2sbi + +static inline struct ll_file_data *cl_iattr2fd(struct inode *inode, + const struct iattr *attr) +{ + LASSERT(attr->ia_valid & ATTR_FILE); + return LUSTRE_FPRIVATE(attr->ia_file); +} + +static inline void cl_isize_lock(struct inode *inode) +{ + ll_inode_size_lock(inode); +} + +static inline void cl_isize_unlock(struct inode *inode) +{ + ll_inode_size_unlock(inode); +} + +static inline void cl_isize_write_nolock(struct inode *inode, loff_t kms) +{ + LASSERT(down_trylock(&ll_i2info(inode)->lli_size_sem) != 0); + i_size_write(inode, kms); +} + +static inline void cl_isize_write(struct inode *inode, loff_t kms) +{ + ll_inode_size_lock(inode); + i_size_write(inode, kms); + ll_inode_size_unlock(inode); +} + +#define cl_isize_read(inode) i_size_read(inode) + +static inline int cl_merge_lvb(const struct lu_env *env, struct inode *inode) +{ + return ll_merge_lvb(env, inode); +} + +#define cl_inode_atime(inode) LTIME_S((inode)->i_atime) +#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime) +#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime) + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); + +int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, + enum cl_fsync_mode mode); + +/** direct write pages */ +struct ll_dio_pages { + /** page array to be written. we don't support + * partial pages except the last one. */ + struct page **ldp_pages; + /* offset of each page */ + loff_t *ldp_offsets; + /** if ldp_offsets is NULL, it means a sequential + * pages to be written, then this is the file offset + * of the * first page. */ + loff_t ldp_start_offset; + /** how many bytes are to be written. */ + size_t ldp_size; + /** # of pages in the array. */ + int ldp_nr; +}; + +static inline void cl_stats_tally(struct cl_device *dev, enum cl_req_type crt, + int rc) +{ + int opc = (crt == CRT_READ) ? LPROC_LL_OSC_READ : + LPROC_LL_OSC_WRITE; + + ll_stats_ops_tally(ll_s2sbi(cl2ccc_dev(dev)->cdv_sb), opc, rc); +} + +extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv); + +static inline int ll_file_nolock(const struct file *file) +{ + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct inode *inode = file->f_dentry->d_inode; + + LASSERT(fd != NULL); + return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) || + (ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK)); +} + +static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode, + struct lookup_intent *it, __u64 *bits) +{ + if (!it->d.lustre.it_lock_set) { + struct lustre_handle handle; + + /* If this inode is a remote object, it will get two + * separate locks in different namespaces, Master MDT, + * where the name entry is, will grant LOOKUP lock, + * remote MDT, where the object is, will grant + * UPDATE|PERM lock. The inode will be attched to both + * LOOKUP and PERM locks, so revoking either locks will + * case the dcache being cleared */ + if (it->d.lustre.it_remote_lock_mode) { + handle.cookie = it->d.lustre.it_remote_lock_handle; + CDEBUG(D_DLMTRACE, "setting l_data to inode %p" + "(%lu/%u) for remote lock "LPX64"\n", inode, + inode->i_ino, inode->i_generation, + handle.cookie); + md_set_lock_data(exp, &handle.cookie, inode, NULL); + } + + handle.cookie = it->d.lustre.it_lock_handle; + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)" + " for lock "LPX64"\n", inode, inode->i_ino, + inode->i_generation, handle.cookie); + + md_set_lock_data(exp, &handle.cookie, inode, + &it->d.lustre.it_lock_bits); + it->d.lustre.it_lock_set = 1; + } + + if (bits != NULL) + *bits = it->d.lustre.it_lock_bits; +} + +static inline void ll_lock_dcache(struct inode *inode) +{ + spin_lock(&inode->i_lock); +} + +static inline void ll_unlock_dcache(struct inode *inode) +{ + spin_unlock(&inode->i_lock); +} + +static inline int d_lustre_invalid(const struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + return (lld == NULL) || lld->lld_invalid; +} + +static inline void __d_lustre_invalidate(struct dentry *dentry) +{ + struct ll_dentry_data *lld = ll_d2d(dentry); + + if (lld != NULL) + lld->lld_invalid = 1; +} + +/* + * Mark dentry INVALID, if dentry refcount is zero (this is normally case for + * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; + * else dput() of the last refcount will unhash this dentry and kill it. + */ +static inline void d_lustre_invalidate(struct dentry *dentry) +{ + CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p " + "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry, + dentry->d_parent, dentry->d_inode, d_refcount(dentry)); + + spin_lock(&dentry->d_lock); + __d_lustre_invalidate(dentry); + if (d_refcount(dentry) == 0) + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void d_lustre_revalidate(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + LASSERT(ll_d2d(dentry) != NULL); + ll_d2d(dentry)->lld_invalid = 0; + spin_unlock(&dentry->d_lock); +} + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) +/* Compatibility for old (1.8) compiled userspace quota code */ +struct if_quotactl_18 { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; +#define LL_IOC_QUOTACTL_18 _IOWR('f', 162, struct if_quotactl_18 *) +/* End compatibility for old (1.8) compiled userspace quota code */ +#else +#warning "remove old LL_IOC_QUOTACTL_18 compatibility code" +#endif /* LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) */ + +enum { + LL_LAYOUT_GEN_NONE = ((__u32)-2), /* layout lock was cancelled */ + LL_LAYOUT_GEN_EMPTY = ((__u32)-1) /* for empty layout */ +}; + +int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); +int ll_layout_refresh(struct inode *inode, __u32 *gen); + +#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c new file mode 100644 index 000000000000..278b97dd94c9 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -0,0 +1,2424 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_lib.c + * + * Lustre Light Super operations + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_file_data_slab; + +LIST_HEAD(ll_super_blocks); +DEFINE_SPINLOCK(ll_sb_lock); + +#ifndef MS_HAS_NEW_AOPS +extern struct address_space_operations ll_aops; +#else +extern struct address_space_operations_ext ll_aops; +#endif + +#ifndef log2 +#define log2(n) ffz(~(n)) +#endif + +static struct ll_sb_info *ll_init_sbi(void) +{ + struct ll_sb_info *sbi = NULL; + unsigned long pages; + unsigned long lru_page_max; + struct sysinfo si; + class_uuid_t uuid; + int i; + ENTRY; + + OBD_ALLOC(sbi, sizeof(*sbi)); + if (!sbi) + RETURN(NULL); + + spin_lock_init(&sbi->ll_lock); + mutex_init(&sbi->ll_lco.lco_lock); + spin_lock_init(&sbi->ll_pp_extent_lock); + spin_lock_init(&sbi->ll_process_lock); + sbi->ll_rw_stats_on = 0; + + si_meminfo(&si); + pages = si.totalram - si.totalhigh; + if (pages >> (20 - PAGE_CACHE_SHIFT) < 512) { + lru_page_max = pages / 2; + } else { + lru_page_max = (pages / 4) * 3; + } + + /* initialize ll_cache data */ + atomic_set(&sbi->ll_cache.ccc_users, 0); + sbi->ll_cache.ccc_lru_max = lru_page_max; + atomic_set(&sbi->ll_cache.ccc_lru_left, lru_page_max); + spin_lock_init(&sbi->ll_cache.ccc_lru_lock); + INIT_LIST_HEAD(&sbi->ll_cache.ccc_lru); + + atomic_set(&sbi->ll_cache.ccc_unstable_nr, 0); + init_waitqueue_head(&sbi->ll_cache.ccc_unstable_waitq); + + sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, + SBI_DEFAULT_READAHEAD_MAX); + sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = + SBI_DEFAULT_READAHEAD_WHOLE_MAX; + INIT_LIST_HEAD(&sbi->ll_conn_chain); + INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); + + ll_generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); + + spin_lock(&ll_sb_lock); + list_add_tail(&sbi->ll_list, &ll_super_blocks); + spin_unlock(&ll_sb_lock); + + sbi->ll_flags |= LL_SBI_VERBOSE; + sbi->ll_flags |= LL_SBI_CHECKSUM; + + sbi->ll_flags |= LL_SBI_LRU_RESIZE; + + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_r_hist.oh_lock); + spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. + pp_w_hist.oh_lock); + } + + /* metadata statahead is enabled by default */ + sbi->ll_sa_max = LL_SA_RPC_DEF; + atomic_set(&sbi->ll_sa_total, 0); + atomic_set(&sbi->ll_sa_wrong, 0); + atomic_set(&sbi->ll_agl_total, 0); + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + + RETURN(sbi); +} + +void ll_free_sbi(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + + if (sbi != NULL) { + spin_lock(&ll_sb_lock); + list_del(&sbi->ll_list); + spin_unlock(&ll_sb_lock); + OBD_FREE(sbi, sizeof(*sbi)); + } + EXIT; +} + +static struct dentry_operations ll_d_root_ops = { + .d_compare = ll_dcompare, + .d_revalidate = ll_revalidate_nd, +}; + +static int client_common_fill_super(struct super_block *sb, char *md, char *dt, + struct vfsmount *mnt) +{ + struct inode *root = 0; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_capa *oc = NULL; + struct obd_statfs *osfs = NULL; + struct ptlrpc_request *request = NULL; + struct obd_connect_data *data = NULL; + struct obd_uuid *uuid; + struct md_op_data *op_data; + struct lustre_md lmd; + obd_valid valid; + int size, err, checksum; + ENTRY; + + obd = class_name2obd(md); + if (!obd) { + CERROR("MD %s: not setup or attached\n", md); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(data); + if (data == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC_PTR(osfs); + if (osfs == NULL) { + OBD_FREE_PTR(data); + RETURN(-ENOMEM); + } + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + dt, md); + if (err < 0) + CERROR("could not register mount in /proc/fs/lustre\n"); + } + + /* indicate the features supported by this client */ + data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | + OBD_CONNECT_ATTRFID | + OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | + OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR | + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH| + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (sbi->ll_flags & LL_SBI_LRU_RESIZE) + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; +#ifdef CONFIG_FS_POSIX_ACL + data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK; +#endif + + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) + /* flag mdc connection as lightweight, only used for test + * purpose, use with care */ + data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; + + data->ocd_ibits_known = MDS_INODELOCK_FULL; + data->ocd_version = LUSTRE_VERSION_CODE; + + if (sb->s_flags & MS_RDONLY) + data->ocd_connect_flags |= OBD_CONNECT_RDONLY; + if (sbi->ll_flags & LL_SBI_USER_XATTR) + data->ocd_connect_flags |= OBD_CONNECT_XATTR; + +#ifdef HAVE_MS_FLOCK_LOCK + /* force vfs to use lustre handler for flock() calls - bug 10743 */ + sb->s_flags |= MS_FLOCK_LOCK; +#endif +#ifdef MS_HAS_NEW_AOPS + sb->s_flags |= MS_HAS_NEW_AOPS; +#endif + + if (sbi->ll_flags & LL_SBI_FLOCK) + sbi->ll_fop = &ll_file_operations_flock; + else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + sbi->ll_fop = &ll_file_operations; + else + sbi->ll_fop = &ll_file_operations_noflock; + + /* real client */ + data->ocd_connect_flags |= OBD_CONNECT_REAL; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + data->ocd_brw_size = MD_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to complete," + " abort, or time out.\n", md); + GOTO(out, err); + } else if (err) { + CERROR("cannot connect to %s: rc = %d\n", md, err); + GOTO(out, err); + } + + sbi->ll_md_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } + + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * avaible */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, PAGE_CACHE_SIZE); + obd_connect_flags2str(buf, PAGE_CACHE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s). Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, PAGE_CACHE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data, NULL); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } + + LASSERT(osfs->os_bsize); + sb->s_blocksize = osfs->os_bsize; + sb->s_blocksize_bits = log2(osfs->os_bsize); + sb->s_magic = LL_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sbi->ll_namelen = osfs->os_namelen; + sbi->ll_max_rw_chunk = LL_DEFAULT_MAX_RW_CHUNK; + + if ((sbi->ll_flags & LL_SBI_USER_XATTR) && + !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + + if (data->ocd_connect_flags & OBD_CONNECT_ACL) { +#ifdef MS_POSIXACL + sb->s_flags |= MS_POSIXACL; +#endif + sbi->ll_flags |= LL_SBI_ACL; + } else { + LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); +#ifdef MS_POSIXACL + sb->s_flags &= ~MS_POSIXACL; +#endif + sbi->ll_flags &= ~LL_SBI_ACL; + } + + if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) { + if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + sbi->ll_flags |= LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client is set as remote by default.\n"); + } + } else { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + sbi->ll_flags &= ~LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client claims to be remote, but server " + "rejected, forced to be local.\n"); + } + } + + if (data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) { + LCONSOLE_INFO("client enabled MDS capability!\n"); + sbi->ll_flags |= LL_SBI_MDS_CAPA; + } + + if (data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA) { + LCONSOLE_INFO("client enabled OSS capability!\n"); + sbi->ll_flags |= LL_SBI_OSS_CAPA; + } + + if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) + sbi->ll_flags |= LL_SBI_64BIT_HASH; + + if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + sbi->ll_md_brw_size = data->ocd_brw_size; + else + sbi->ll_md_brw_size = PAGE_CACHE_SIZE; + + if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) { + LCONSOLE_INFO("Layout lock feature supported.\n"); + sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; + } + + obd = class_name2obd(dt); + if (!obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } + + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| + OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_VBR| + OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | + OBD_CONNECT_MAXBYTES | + OBD_CONNECT_EINPROGRESS | + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; + + if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) + data->ocd_connect_flags |= OBD_CONNECT_SOM; + + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { + /* OBD_CONNECT_CKSUM should always be set, even if checksums are + * disabled by default, because it can still be enabled on the + * fly via /proc. As a consequence, we still need to come to an + * agreement on the supported algorithms at connect time */ + data->ocd_connect_flags |= OBD_CONNECT_CKSUM; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) + data->ocd_cksum_types = OBD_CKSUM_ADLER; + else + data->ocd_cksum_types = cksum_types_supported_client(); + } + + data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d " + "ocd_grant: %d\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant); + + obd->obd_upcall.onu_owner = &sbi->ll_lco; + obd->obd_upcall.onu_upcall = cl_ocd_update; + + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } + + sbi->ll_dt_exp->exp_connect_data = *data; + + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } + + mutex_lock(&sbi->ll_lco.lco_lock); + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + mutex_unlock(&sbi->ll_lco.lco_lock); + + fid_zero(&sbi->ll_root_fid); + err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + + sb->s_op = &lustre_super_operations; +#if THREAD_SIZE >= 8192 /*b=17630*/ + sb->s_export_op = &lustre_export_operations; +#endif + + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + valid |= OBD_MD_FLRMTPERM; + else if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); + + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_capa1 = oc; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + if (oc) + capa_put(oc); + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } + + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } + + LASSERT(fid_is_sane(&sbi->ll_root_fid)); + root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, + ll_need_32bit_api(sbi)), + &lmd); + md_free_lustre_md(sbi->ll_md_exp, &lmd); + ptlrpc_req_finished(request); + + if (root == NULL || IS_ERR(root)) { + if (lmd.lsm) + obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm); +#ifdef CONFIG_FS_POSIX_ACL + if (lmd.posix_acl) { + posix_acl_release(lmd.posix_acl); + lmd.posix_acl = NULL; + } +#endif + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; + CERROR("lustre_lite: bad iget4 for root\n"); + GOTO(out_root, err); + } + + err = ll_close_thread_start(&sbi->ll_lcq); + if (err) { + CERROR("cannot start close thread: rc %d\n", err); + GOTO(out_root, err); + } + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + rct_init(&sbi->ll_rct); + et_init(&sbi->ll_et); + } +#endif + + checksum = sbi->ll_flags & LL_SBI_CHECKSUM; + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, + NULL); + cl_sb_init(sb); + + err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), + KEY_CACHE_SET, sizeof(sbi->ll_cache), + &sbi->ll_cache, NULL); + + sb->s_root = d_make_root(root); + if (sb->s_root == NULL) { + CERROR("%s: can't make root dentry\n", + ll_get_fsname(sb, NULL, 0)); + GOTO(out_root, err = -ENOMEM); + } + + /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ + d_set_d_op(sb->s_root, &ll_d_root_ops); + sb->s_d_op = &ll_d_ops; + + sbi->ll_sdev_orig = sb->s_dev; + + /* We set sb->s_dev equal on all lustre clients in order to support + * NFS export clustering. NFSD requires that the FSID be the same + * on all clients. */ + /* s_dev is also used in lt_compare() to compare two fs, but that is + * only a node-local comparison. */ + uuid = obd_get_uuid(sbi->ll_md_exp); + if (uuid != NULL) + sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); + + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + + RETURN(err); +out_root: + if (root) + iput(root); +out_lock_cn_cb: + obd_fid_fini(sbi->ll_dt_exp->exp_obd); +out_dt: + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* Make sure all OScs are gone, since cl_cache is accessing sbi. */ + obd_zombie_barrier(); +out_md_fid: + obd_fid_fini(sbi->ll_md_exp->exp_obd); +out_md: + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (osfs != NULL) + OBD_FREE_PTR(osfs); + lprocfs_unregister_mountpoint(sbi); + return err; +} + +int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) +{ + int size, rc; + + *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL); + size = sizeof(int); + rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize, NULL); + if (rc) + CERROR("Get max mdsize error rc %d \n", rc); + + RETURN(rc); +} + +void ll_dump_inode(struct inode *inode) +{ + struct ll_d_hlist_node *tmp; + int dentry_count = 0; + + LASSERT(inode != NULL); + + ll_d_hlist_for_each(tmp, &inode->i_dentry) + dentry_count++; + + CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n", + inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino, + inode->i_mode, atomic_read(&inode->i_count), dentry_count); +} + +void lustre_dump_dentry(struct dentry *dentry, int recur) +{ + struct list_head *tmp; + int subdirs = 0; + + LASSERT(dentry != NULL); + + list_for_each(tmp, &dentry->d_subdirs) + subdirs++; + + CERROR("dentry %p dump: name=%.*s parent=%.*s (%p), inode=%p, count=%u," + " flags=0x%x, fsdata=%p, %d subdirs\n", dentry, + dentry->d_name.len, dentry->d_name.name, + dentry->d_parent->d_name.len, dentry->d_parent->d_name.name, + dentry->d_parent, dentry->d_inode, d_refcount(dentry), + dentry->d_flags, dentry->d_fsdata, subdirs); + if (dentry->d_inode != NULL) + ll_dump_inode(dentry->d_inode); + + if (recur == 0) + return; + + list_for_each(tmp, &dentry->d_subdirs) { + struct dentry *d = list_entry(tmp, struct dentry, d_u.d_child); + lustre_dump_dentry(d, recur - 1); + } +} + +void client_common_put_super(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + ENTRY; + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + et_fini(&sbi->ll_et); + rct_fini(&sbi->ll_rct); + } +#endif + + ll_close_thread_shutdown(sbi->ll_lcq); + + cl_sb_fini(sb); + + list_del(&sbi->ll_conn_chain); + + obd_fid_fini(sbi->ll_dt_exp->exp_obd); + obd_disconnect(sbi->ll_dt_exp); + sbi->ll_dt_exp = NULL; + /* wait till all OSCs are gone, since cl_cache is accessing sbi. + * see LU-2543. */ + obd_zombie_barrier(); + + lprocfs_unregister_mountpoint(sbi); + + obd_fid_fini(sbi->ll_md_exp->exp_obd); + obd_disconnect(sbi->ll_md_exp); + sbi->ll_md_exp = NULL; + + EXIT; +} + +void ll_kill_super(struct super_block *sb) +{ + struct ll_sb_info *sbi; + + ENTRY; + + /* not init sb ?*/ + if (!(sb->s_flags & MS_ACTIVE)) + return; + + sbi = ll_s2sbi(sb); + /* we need restore s_dev from changed for clustred NFS before put_super + * because new kernels have cached s_dev and change sb->s_dev in + * put_super not affected real removing devices */ + if (sbi) + sb->s_dev = sbi->ll_sdev_orig; + EXIT; +} + +char *ll_read_opt(const char *opt, char *data) +{ + char *value; + char *retval; + ENTRY; + + CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data); + if (strncmp(opt, data, strlen(opt))) + RETURN(NULL); + if ((value = strchr(data, '=')) == NULL) + RETURN(NULL); + + value++; + OBD_ALLOC(retval, strlen(value) + 1); + if (!retval) { + CERROR("out of memory!\n"); + RETURN(NULL); + } + + memcpy(retval, value, strlen(value)+1); + CDEBUG(D_SUPER, "Assigned option: %s, value %s\n", opt, retval); + RETURN(retval); +} + +static inline int ll_set_opt(const char *opt, char *data, int fl) +{ + if (strncmp(opt, data, strlen(opt)) != 0) + return(0); + else + return(fl); +} + +/* non-client-specific mount options are parsed in lmd_parse */ +static int ll_options(char *options, int *flags) +{ + int tmp; + char *s1 = options, *s2; + ENTRY; + + if (!options) + RETURN(0); + + CDEBUG(D_CONFIG, "Parsing opts %s\n", options); + + while (*s1) { + CDEBUG(D_SUPER, "next opt=%s\n", s1); + tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); + if (tmp) { + *flags &= ~tmp; + goto next; + } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 50, 0) + tmp = ll_set_opt("acl", s1, LL_SBI_ACL); + if (tmp) { + /* Ignore deprecated mount option. The client will + * always try to mount with ACL support, whether this + * is used depends on whether server supports it. */ + LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated " + "mount option 'acl'.\n"); + goto next; + } + tmp = ll_set_opt("noacl", s1, LL_SBI_ACL); + if (tmp) { + LCONSOLE_ERROR_MSG(0x152, "Ignoring deprecated " + "mount option 'noacl'.\n"); + goto next; + } +#else +#warning "{no}acl options have been deprecated since 1.8, please remove them" +#endif + tmp = ll_set_opt("remote_client", s1, LL_SBI_RMT_CLIENT); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); + if (tmp) { + *flags &= ~tmp; + goto next; + } + + tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); + if (tmp) { + *flags &= ~tmp; + goto next; + } + tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags |= tmp; + goto next; + } + tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); + if (tmp) { + *flags &= ~tmp; + goto next; + } + LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", + s1); + RETURN(-EINVAL); + +next: + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) + break; + s1 = s2 + 1; + } + RETURN(0); +} + +void ll_lli_init(struct ll_inode_info *lli) +{ + lli->lli_inode_magic = LLI_INODE_MAGIC; + lli->lli_flags = 0; + lli->lli_ioepoch = 0; + lli->lli_maxbytes = MAX_LFS_FILESIZE; + spin_lock_init(&lli->lli_lock); + lli->lli_posix_acl = NULL; + lli->lli_remote_perms = NULL; + mutex_init(&lli->lli_rmtperm_mutex); + /* Do not set lli_fid, it has been initialized already. */ + fid_zero(&lli->lli_pfid); + INIT_LIST_HEAD(&lli->lli_close_list); + INIT_LIST_HEAD(&lli->lli_oss_capas); + atomic_set(&lli->lli_open_count, 0); + lli->lli_mds_capa = NULL; + lli->lli_rmtperm_time = 0; + lli->lli_pending_och = NULL; + lli->lli_mds_read_och = NULL; + lli->lli_mds_write_och = NULL; + lli->lli_mds_exec_och = NULL; + lli->lli_open_fd_read_count = 0; + lli->lli_open_fd_write_count = 0; + lli->lli_open_fd_exec_count = 0; + mutex_init(&lli->lli_och_mutex); + spin_lock_init(&lli->lli_agl_lock); + lli->lli_has_smd = false; + lli->lli_layout_gen = LL_LAYOUT_GEN_NONE; + lli->lli_clob = NULL; + + LASSERT(lli->lli_vfs_inode.i_mode != 0); + if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { + mutex_init(&lli->lli_readdir_mutex); + lli->lli_opendir_key = NULL; + lli->lli_sai = NULL; + lli->lli_def_acl = NULL; + spin_lock_init(&lli->lli_sa_lock); + lli->lli_opendir_pid = 0; + } else { + sema_init(&lli->lli_size_sem, 1); + lli->lli_size_sem_owner = NULL; + lli->lli_symlink_name = NULL; + init_rwsem(&lli->lli_trunc_sem); + mutex_init(&lli->lli_write_mutex); + init_rwsem(&lli->lli_glimpse_sem); + lli->lli_glimpse_time = 0; + INIT_LIST_HEAD(&lli->lli_agl_list); + lli->lli_agl_index = 0; + lli->lli_async_rc = 0; + lli->lli_volatile = false; + } + mutex_init(&lli->lli_layout_mutex); +} + +static inline int ll_bdi_register(struct backing_dev_info *bdi) +{ + static atomic_t ll_bdi_num = ATOMIC_INIT(0); + + bdi->name = "lustre"; + return bdi_register(bdi, NULL, "lustre-%d", + atomic_inc_return(&ll_bdi_num)); +} + +int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) +{ + struct lustre_profile *lprof = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi; + char *dt = NULL, *md = NULL; + char *profilenm = get_profile_name(sb); + struct config_llog_instance *cfg; + /* %p for void* in printf needs 16+2 characters: 0xffffffffffffffff */ + const int instlen = sizeof(cfg->cfg_instance) * 2 + 2; + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + RETURN(-ENOMEM); + + try_module_get(THIS_MODULE); + + /* client additional sb info */ + lsi->lsi_llsbi = sbi = ll_init_sbi(); + if (!sbi) { + module_put(THIS_MODULE); + OBD_FREE_PTR(cfg); + RETURN(-ENOMEM); + } + + err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); + if (err) + GOTO(out_free, err); + + err = bdi_init(&lsi->lsi_bdi); + if (err) + GOTO(out_free, err); + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + err = ll_bdi_register(&lsi->lsi_bdi); + if (err) + GOTO(out_free, err); + + sb->s_bdi = &lsi->lsi_bdi; + + /* Generate a string unique to this super, in case some joker tries + to mount the same fs at two mount points. + Use the address of the super itself.*/ + cfg->cfg_instance = sb; + cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; + cfg->cfg_callback = class_config_llog_handler; + /* set up client obds */ + err = lustre_process_log(sb, profilenm, cfg); + if (err < 0) { + CERROR("Unable to process log: %d\n", err); + GOTO(out_free, err); + } + + /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ + lprof = class_get_profile(profilenm); + if (lprof == NULL) { + LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be" + " read from the MGS. Does that filesystem " + "exist?\n", profilenm); + GOTO(out_free, err = -EINVAL); + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, + lprof->lp_md, lprof->lp_dt); + + OBD_ALLOC(dt, strlen(lprof->lp_dt) + instlen + 2); + if (!dt) + GOTO(out_free, err = -ENOMEM); + sprintf(dt, "%s-%p", lprof->lp_dt, cfg->cfg_instance); + + OBD_ALLOC(md, strlen(lprof->lp_md) + instlen + 2); + if (!md) + GOTO(out_free, err = -ENOMEM); + sprintf(md, "%s-%p", lprof->lp_md, cfg->cfg_instance); + + /* connections, registrations, sb setup */ + err = client_common_fill_super(sb, md, dt, mnt); + +out_free: + if (md) + OBD_FREE(md, strlen(lprof->lp_md) + instlen + 2); + if (dt) + OBD_FREE(dt, strlen(lprof->lp_dt) + instlen + 2); + if (err) + ll_put_super(sb); + else if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Mounted %s\n", profilenm); + + OBD_FREE_PTR(cfg); + RETURN(err); +} /* ll_fill_super */ + + +void lu_context_keys_dump(void); + +void ll_put_super(struct super_block *sb) +{ + struct config_llog_instance cfg; + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int ccc_count, next, force = 1, rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); + + ll_print_capa_stat(sbi); + + cfg.cfg_instance = sb; + lustre_end_log(sb, profilenm, &cfg); + + if (sbi->ll_md_exp) { + obd = class_exp2obd(sbi->ll_md_exp); + if (obd) + force = obd->obd_force; + } + + /* Wait for unstable pages to be committed to stable storage */ + if (force == 0) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sbi->ll_cache.ccc_unstable_waitq, + atomic_read(&sbi->ll_cache.ccc_unstable_nr) == 0, + &lwi); + } + + ccc_count = atomic_read(&sbi->ll_cache.ccc_unstable_nr); + if (force == 0 && rc != -EINTR) + LASSERTF(ccc_count == 0, "count: %i\n", ccc_count); + + + /* We need to set force before the lov_disconnect in + lustre_common_put_super, since l_d cleans up osc's as well. */ + if (force) { + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, + &next)) != NULL) { + obd->obd_force = force; + } + } + + if (sbi->ll_lcq) { + /* Only if client_common_fill_super succeeded */ + client_common_put_super(sb); + } + + next = 0; + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) !=NULL) { + class_manual_cleanup(obd); + } + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : ""); + + if (profilenm) + class_del_profile(profilenm); + + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } + + ll_free_sbi(sb); + lsi->lsi_llsbi = NULL; + + lustre_common_put_super(sb); + + module_put(THIS_MODULE); + + EXIT; +} /* client_put_super */ + +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +struct inode *ll_inode_from_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_ast_data) { + struct ll_inode_info *lli = ll_i2info(lock->l_ast_data); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_ast_data); + } else { + inode = lock->l_ast_data; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "l_ast_data %p is " + "bogus: magic %08x", lock->l_ast_data, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + +void ll_clear_inode(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + if (S_ISDIR(inode->i_mode)) { + /* these should have been cleared in ll_file_release */ + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(lli->lli_sai == NULL); + LASSERT(lli->lli_opendir_pid == 0); + } + + ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); + + LASSERT(!lli->lli_open_fd_write_count); + LASSERT(!lli->lli_open_fd_read_count); + LASSERT(!lli->lli_open_fd_exec_count); + + if (lli->lli_mds_write_och) + ll_md_real_close(inode, FMODE_WRITE); + if (lli->lli_mds_exec_och) + ll_md_real_close(inode, FMODE_EXEC); + if (lli->lli_mds_read_och) + ll_md_real_close(inode, FMODE_READ); + + if (S_ISLNK(inode->i_mode) && lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + LASSERT(lli->lli_posix_acl == NULL); + if (lli->lli_remote_perms) { + free_rmtperm_hash(lli->lli_remote_perms); + lli->lli_remote_perms = NULL; + } + } +#ifdef CONFIG_FS_POSIX_ACL + else if (lli->lli_posix_acl) { + LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1); + LASSERT(lli->lli_remote_perms == NULL); + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = NULL; + } +#endif + lli->lli_inode_magic = LLI_INODE_DEAD; + + ll_clear_inode_capas(inode); + if (!S_ISDIR(inode->i_mode)) + LASSERT(list_empty(&lli->lli_agl_list)); + + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. + */ + cl_inode_fini(inode); + lli->lli_has_smd = false; + + EXIT; +} + +int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, + struct md_open_data **mod) +{ + struct lustre_md md; + struct inode *inode = dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + int rc, ia_valid; + ENTRY; + + op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0, + &request, mod); + if (rc) { + ptlrpc_req_finished(request); + if (rc == -ENOENT) { + clear_nlink(inode); + /* Unlinked special device node? Or just a race? + * Pretend we done everything. */ + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode)) { + ia_valid = op_data->op_attr.ia_valid; + op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + } + } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { + CERROR("md_setattr fails: rc = %d\n", rc); + } + RETURN(rc); + } + + rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + RETURN(rc); + } + + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. */ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; + + /* Extract epoch data if obtained. */ + op_data->op_handle = md.body->handle; + op_data->op_ioepoch = md.body->ioepoch; + + ll_update_inode(inode, &md); + ptlrpc_req_finished(request); + + RETURN(rc); +} + +/* Close IO epoch and send Size-on-MDS attribute update. */ +static int ll_setattr_done_writing(struct inode *inode, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct ll_inode_info *lli = ll_i2info(inode); + int rc = 0; + ENTRY; + + LASSERT(op_data != NULL); + if (!S_ISREG(inode->i_mode)) + RETURN(0); + + CDEBUG(D_INODE, "Epoch "LPU64" closed on "DFID" for truncate\n", + op_data->op_ioepoch, PFID(&lli->lli_fid)); + + op_data->op_flags = MF_EPOCH_CLOSE; + ll_done_writing_attr(inode, op_data); + ll_pack_inode2opdata(inode, op_data, NULL); + + rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod); + if (rc == -EAGAIN) { + /* MDS has instructed us to obtain Size-on-MDS attribute + * from OSTs and send setattr to back to MDS. */ + rc = ll_som_update(inode, op_data); + } else if (rc) { + CERROR("inode %lu mdc truncate failed: rc = %d\n", + inode->i_ino, rc); + } + RETURN(rc); +} + +static int ll_setattr_ost(struct inode *inode, struct iattr *attr) +{ + struct obd_capa *capa; + int rc; + + if (attr->ia_valid & ATTR_SIZE) + capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC); + else + capa = ll_mdscapa_get(inode); + + rc = cl_setattr_ost(inode, attr, capa); + + if (attr->ia_valid & ATTR_SIZE) + ll_truncate_free_capa(capa); + else + capa_put(capa); + + return rc; +} + + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + */ +int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + struct md_op_data *op_data = NULL; + struct md_open_data *mod = NULL; + int rc = 0, rc1 = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "%s: setattr inode %p/fid:"DFID" from %llu to %llu, " + "valid %x\n", ll_get_fsname(inode->i_sb, NULL, 0), inode, + PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size, + attr->ia_valid); + + if (attr->ia_valid & ATTR_SIZE) { + /* Check new size against VFS/VM file size limit and rlimit */ + rc = inode_newsize_ok(inode, attr->ia_size); + if (rc) + RETURN(rc); + + /* The maximum Lustre file size is variable, based on the + * OST maximum object size and number of stripes. This + * needs another check in addition to the VFS check above. */ + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE,"file "DFID" too large %llu > "LPU64"\n", + PFID(&lli->lli_fid), attr->ia_size, + ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + + /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ + if (attr->ia_valid & TIMES_SET_FLAGS) { + if (current_fsuid() != inode->i_uid && + !cfs_capable(CFS_CAP_FOWNER)) + RETURN(-EPERM); + } + + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (attr->ia_valid & ATTR_CTIME) { + attr->ia_ctime = CFS_CURRENT_TIME; + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = CFS_CURRENT_TIME; + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = CFS_CURRENT_TIME; + attr->ia_valid |= ATTR_MTIME_SET; + } + + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n", + LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), + cfs_time_current_sec()); + + /* If we are changing file size, file content is modified, flag it. */ + if (attr->ia_valid & ATTR_SIZE) { + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + /* We always do an MDS RPC, even if we're only changing the size; + * only the MDS knows whether truncate() should fail with -ETXTBUSY */ + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + RETURN(-ENOMEM); + + if (!S_ISDIR(inode->i_mode)) { + if (attr->ia_valid & ATTR_SIZE) + inode_dio_write_done(inode); + mutex_unlock(&inode->i_mutex); + down_write(&lli->lli_trunc_sem); + } + + memcpy(&op_data->op_attr, attr, sizeof(*attr)); + + /* Open epoch for truncate. */ + if (exp_connect_som(ll_i2mdexp(inode)) && + (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET))) + op_data->op_flags = MF_EPOCH_OPEN; + + rc = ll_md_setattr(dentry, op_data, &mod); + if (rc) + GOTO(out, rc); + + /* RPC to MDT is sent, cancel data modification flag */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + ll_ioepoch_open(lli, op_data->op_ioepoch); + if (!S_ISREG(inode->i_mode)) + GOTO(out, rc = 0); + + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET)) + /* For truncate and utimes sending attributes to OSTs, setting + * mtime/atime to the past will be performed under PW [0:EOF] + * extent lock (new_size:EOF for truncate). It may seem + * excessive to send mtime/atime updates to OSTs when not + * setting times to past, but it is necessary due to possible + * time de-synchronization between MDT inode and OST objects */ + rc = ll_setattr_ost(inode, attr); + EXIT; +out: + if (op_data) { + if (op_data->op_ioepoch) { + rc1 = ll_setattr_done_writing(inode, op_data, mod); + if (!rc) + rc = rc1; + } + ll_finish_md_op_data(op_data); + } + if (!S_ISDIR(inode->i_mode)) { + up_write(&lli->lli_trunc_sem); + mutex_lock(&inode->i_mutex); + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + } + + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? + LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + + return rc; +} + +int ll_setattr(struct dentry *de, struct iattr *attr) +{ + int mode = de->d_inode->i_mode; + + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == + (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + + if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) && + (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || + (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + + if ((mode & S_ISUID) && + !(attr->ia_mode & S_ISUID) && + !(attr->ia_valid & ATTR_KILL_SUID)) + attr->ia_valid |= ATTR_KILL_SUID; + + if (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) && + !(attr->ia_mode & S_ISGID) && + !(attr->ia_valid & ATTR_KILL_SGID)) + attr->ia_valid |= ATTR_KILL_SGID; + + return ll_setattr_raw(de, attr); +} + +int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_statfs obd_osfs; + int rc; + ENTRY; + + rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); + if (rc) { + CERROR("md_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + osfs->os_type = sb->s_magic; + + CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + flags |= OBD_STATFS_NODELAY; + + rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_bsize = obd_osfs.os_bsize; + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. + */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + + RETURN(rc); +} +int ll_statfs(struct dentry *de, struct kstatfs *sfs) +{ + struct super_block *sb = de->d_sb; + struct obd_statfs osfs; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op: at "LPU64" jiffies\n", get_jiffies_64()); + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); + + /* Some amount of caching on the client is allowed */ + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + /* We need to downshift for all 32-bit kernels, because we can't + * tell if the kernel is being called via sys_statfs64() or not. + * Stop before overflowing f_bsize - in which case it is better + * to just risk EOVERFLOW if caller is using old sys_statfs(). */ + if (sizeof(long) < 8) { + while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + + return 0; +} + +void ll_inode_size_lock(struct inode *inode) +{ + struct ll_inode_info *lli; + + LASSERT(!S_ISDIR(inode->i_mode)); + + lli = ll_i2info(inode); + LASSERT(lli->lli_size_sem_owner != current); + down(&lli->lli_size_sem); + LASSERT(lli->lli_size_sem_owner == NULL); + lli->lli_size_sem_owner = current; +} + +void ll_inode_size_unlock(struct inode *inode) +{ + struct ll_inode_info *lli; + + lli = ll_i2info(inode); + LASSERT(lli->lli_size_sem_owner == current); + lli->lli_size_sem_owner = NULL; + up(&lli->lli_size_sem); +} + +void ll_update_inode(struct inode *inode, struct lustre_md *md) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = md->body; + struct lov_stripe_md *lsm = md->lsm; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (!lli->lli_has_smd && + !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) + cl_file_inode_init(inode, md); + + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > MAX_LFS_FILESIZE) + lli->lli_maxbytes = MAX_LFS_FILESIZE; + } + + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + if (body->valid & OBD_MD_FLRMTPERM) + ll_update_remote_perm(inode, md->remote_perm); + } +#ifdef CONFIG_FS_POSIX_ACL + else if (body->valid & OBD_MD_FLACL) { + spin_lock(&lli->lli_lock); + if (lli->lli_posix_acl) + posix_acl_release(lli->lli_posix_acl); + lli->lli_posix_acl = md->posix_acl; + spin_unlock(&lli->lli_lock); + } +#endif + inode->i_ino = cl_fid_build_ino(&body->fid1, ll_need_32bit_api(sbi)); + inode->i_generation = cl_fid_build_gen(&body->fid1); + + if (body->valid & OBD_MD_FLATIME) { + if (body->atime > LTIME_S(inode->i_atime)) + LTIME_S(inode->i_atime) = body->atime; + lli->lli_lvb.lvb_atime = body->atime; + } + if (body->valid & OBD_MD_FLMTIME) { + if (body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu " + "to "LPU64"\n", inode->i_ino, + LTIME_S(inode->i_mtime), body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + lli->lli_lvb.lvb_mtime = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME) { + if (body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; + lli->lli_lvb.lvb_ctime = body->ctime; + } + if (body->valid & OBD_MD_FLMODE) + inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT); + LASSERT(inode->i_mode != 0); + if (S_ISREG(inode->i_mode)) { + inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, LL_MAX_BLKSIZE_BITS); + } else { + inode->i_blkbits = inode->i_sb->s_blocksize_bits; + } + if (body->valid & OBD_MD_FLUID) + inode->i_uid = body->uid; + if (body->valid & OBD_MD_FLGID) + inode->i_gid = body->gid; + if (body->valid & OBD_MD_FLFLAGS) + inode->i_flags = ll_ext_to_inode_flags(body->flags); + if (body->valid & OBD_MD_FLNLINK) + set_nlink(inode, body->nlink); + if (body->valid & OBD_MD_FLRDEV) + inode->i_rdev = old_decode_dev(body->rdev); + + if (body->valid & OBD_MD_FLID) { + /* FID shouldn't be changed! */ + if (fid_is_sane(&lli->lli_fid)) { + LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1), + "Trying to change FID "DFID + " to the "DFID", inode %lu/%u(%p)\n", + PFID(&lli->lli_fid), PFID(&body->fid1), + inode->i_ino, inode->i_generation, inode); + } else + lli->lli_fid = body->fid1; + } + + LASSERT(fid_seq(&lli->lli_fid) != 0); + + if (body->valid & OBD_MD_FLSIZE) { + if (exp_connect_som(ll_i2mdexp(inode)) && + S_ISREG(inode->i_mode)) { + struct lustre_handle lockh; + ldlm_mode_t mode; + + /* As it is possible a blocking ast has been processed + * by this time, we need to check there is an UPDATE + * lock on the client and set LLIF_MDS_SIZE_LOCK holding + * it. */ + mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE, + &lockh, LDLM_FL_CBPENDING); + if (mode) { + if (lli->lli_flags & (LLIF_DONE_WRITING | + LLIF_EPOCH_PENDING | + LLIF_SOM_DIRTY)) { + CERROR("ino %lu flags %u still has " + "size authority! do not trust " + "the size got from MDS\n", + inode->i_ino, lli->lli_flags); + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + lli->lli_flags |= LLIF_MDS_SIZE_LOCK; + } + ldlm_lock_decref(&lockh, mode); + } + } else { + /* Use old size assignment to avoid + * deadlock bz14138 & bz14326 */ + i_size_write(inode, body->size); + + CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n", + inode->i_ino, (unsigned long long)body->size); + } + + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + LASSERT(md->mds_capa); + ll_add_capa(inode, md->mds_capa); + } + if (body->valid & OBD_MD_FLOSSCAPA) { + LASSERT(md->oss_capa); + ll_add_capa(inode, md->oss_capa); + } +} + +void ll_read_inode2(struct inode *inode, void *opaque) +{ + struct lustre_md *md = opaque; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n", + PFID(&lli->lli_fid), inode); + + LASSERT(!lli->lli_has_smd); + + /* Core attributes from the MDS first. This is a new inode, and + * the VFS doesn't zero times in the core inode so we have to do + * it ourselves. They will be overwritten by either MDS or OST + * attributes - we just need to make sure they aren't newer. */ + LTIME_S(inode->i_mtime) = 0; + LTIME_S(inode->i_atime) = 0; + LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; + ll_update_inode(inode, md); + + /* OIDEBUG(inode); */ + + /* initializing backing dev info. */ + inode->i_mapping->backing_dev_info = &s2lsi(inode->i_sb)->lsi_bdi; + + + if (S_ISREG(inode->i_mode)) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + inode->i_op = &ll_file_inode_operations; + inode->i_fop = sbi->ll_fop; + inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; + EXIT; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ll_dir_inode_operations; + inode->i_fop = &ll_dir_operations; + EXIT; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &ll_fast_symlink_inode_operations; + EXIT; + } else { + inode->i_op = &ll_special_inode_operations; + + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + + EXIT; + } +} + +void ll_delete_inode(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + ENTRY; + + if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) + /* discard all dirty pages before truncating them, required by + * osc_extent implementation at LU-1030. */ + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_DISCARD); + + truncate_inode_pages(&inode->i_data, 0); + + /* Workaround for LU-118 */ + if (inode->i_data.nrpages) { + TREE_READ_LOCK_IRQ(&inode->i_data); + TREE_READ_UNLOCK_IRQ(&inode->i_data); + LASSERTF(inode->i_data.nrpages == 0, + "inode=%lu/%u(%p) nrpages=%lu, see " + "http://jira.whamcloud.com/browse/LU-118\n", + inode->i_ino, inode->i_generation, inode, + inode->i_data.nrpages); + } + /* Workaround end */ + + ll_clear_inode(inode); + clear_inode(inode); + + EXIT; +} + +int ll_iocontrol(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + int rc, flags = 0; + ENTRY; + + switch(cmd) { + case FSFILT_IOC_GETFLAGS: { + struct mdt_body *body; + struct md_op_data *op_data; + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, + 0, 0, LUSTRE_OPC_ANY, + NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_FLFLAGS; + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu\n", rc, inode->i_ino); + RETURN(-abs(rc)); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + flags = body->flags; + + ptlrpc_req_finished(req); + + RETURN(put_user(flags, (int *)arg)); + } + case FSFILT_IOC_SETFLAGS: { + struct lov_stripe_md *lsm; + struct obd_info oinfo = { { { 0 } } }; + struct md_op_data *op_data; + + if (get_user(flags, (int *)arg)) + RETURN(-EFAULT); + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + rc = md_setattr(sbi->ll_md_exp, op_data, + NULL, 0, NULL, 0, &req, NULL); + ll_finish_md_op_data(op_data); + ptlrpc_req_finished(req); + if (rc) + RETURN(rc); + + inode->i_flags = ll_ext_to_inode_flags(flags); + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) + RETURN(0); + + OBDO_ALLOC(oinfo.oi_oa); + if (!oinfo.oi_oa) { + ccc_inode_lsm_put(inode, lsm); + RETURN(-ENOMEM); + } + oinfo.oi_md = lsm; + oinfo.oi_oa->o_oi = lsm->lsm_oi; + oinfo.oi_oa->o_flags = flags; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | + OBD_MD_FLGROUP; + oinfo.oi_capa = ll_mdscapa_get(inode); + obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid); + rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL); + capa_put(oinfo.oi_capa); + OBDO_FREE(oinfo.oi_oa); + ccc_inode_lsm_put(inode, lsm); + + if (rc && rc != -EPERM && rc != -EACCES) + CERROR("osc_setattr_async fails: rc = %d\n", rc); + + RETURN(rc); + } + default: + RETURN(-ENOSYS); + } + + RETURN(0); +} + +int ll_flush_ctx(struct inode *inode) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + + CDEBUG(D_SEC, "flush context for user %d\n", current_uid()); + + obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, + 0, NULL, NULL); + return 0; +} + +/* umount -f client means force down, don't save state */ +void ll_umount_begin(struct super_block *sb) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + struct obd_ioctl_data *ioc_data; + ENTRY; + + + CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, + sb->s_count, atomic_read(&sb->s_active)); + + obd = class_exp2obd(sbi->ll_md_exp); + if (obd == NULL) { + CERROR("Invalid MDC connection handle "LPX64"\n", + sbi->ll_md_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + obd = class_exp2obd(sbi->ll_dt_exp); + if (obd == NULL) { + CERROR("Invalid LOV connection handle "LPX64"\n", + sbi->ll_dt_exp->exp_handle.h_cookie); + EXIT; + return; + } + obd->obd_force = 1; + + OBD_ALLOC_PTR(ioc_data); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof *ioc_data, ioc_data, NULL); + + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof *ioc_data, ioc_data, NULL); + + OBD_FREE_PTR(ioc_data); + } + + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule() and sleep one second if needed, and hope. + */ + schedule(); + + EXIT; +} + +int ll_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + char *profilenm = get_profile_name(sb); + int err; + __u32 read_only; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + read_only = *flags & MS_RDONLY; + err = obd_set_info_async(NULL, sbi->ll_md_exp, + sizeof(KEY_READ_ONLY), + KEY_READ_ONLY, sizeof(read_only), + &read_only, NULL); + if (err) { + LCONSOLE_WARN("Failed to remount %s %s (%d)\n", + profilenm, read_only ? + "read-only" : "read-write", err); + return err; + } + + if (read_only) + sb->s_flags |= MS_RDONLY; + else + sb->s_flags &= ~MS_RDONLY; + + if (sbi->ll_flags & LL_SBI_VERBOSE) + LCONSOLE_WARN("Remounted %s %s\n", profilenm, + read_only ? "read-only" : "read-write"); + } + return 0; +} + +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) +{ + struct ll_sb_info *sbi = NULL; + struct lustre_md md; + int rc; + ENTRY; + + LASSERT(*inode || sb); + sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode); + rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp, + sbi->ll_md_exp, &md); + if (rc) + RETURN(rc); + + if (*inode) { + ll_update_inode(*inode, &md); + } else { + LASSERT(sb != NULL); + + /* + * At this point server returns to client's same fid as client + * generated for creating. So using ->fid1 is okay here. + */ + LASSERT(fid_is_sane(&md.body->fid1)); + + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, + ll_need_32bit_api(sbi)), + &md); + if (*inode == NULL || IS_ERR(*inode)) { +#ifdef CONFIG_FS_POSIX_ACL + if (md.posix_acl) { + posix_acl_release(md.posix_acl); + md.posix_acl = NULL; + } +#endif + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; + CERROR("new_inode -fatal: rc %d\n", rc); + GOTO(out, rc); + } + } + + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. proc1: to apply a stale layout */ + if (it != NULL && it->d.lustre.it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->d.lustre.it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); + } + +out: + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); + md_free_lustre_md(sbi->ll_md_exp, &md); + RETURN(rc); +} + +int ll_obd_statfs(struct inode *inode, void *arg) +{ + struct ll_sb_info *sbi = NULL; + struct obd_export *exp; + char *buf = NULL; + struct obd_ioctl_data *data = NULL; + __u32 type; + __u32 flags; + int len = 0, rc; + + if (!inode || !(sbi = ll_i2sbi(inode))) + GOTO(out_statfs, rc = -EINVAL); + + rc = obd_ioctl_getdata(&buf, &len, arg); + if (rc) + GOTO(out_statfs, rc); + + data = (void*)buf; + if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || + !data->ioc_pbuf1 || !data->ioc_pbuf2) + GOTO(out_statfs, rc = -EINVAL); + + if (data->ioc_inllen1 != sizeof(__u32) || + data->ioc_inllen2 != sizeof(__u32) || + data->ioc_plen1 != sizeof(struct obd_statfs) || + data->ioc_plen2 != sizeof(struct obd_uuid)) + GOTO(out_statfs, rc = -EINVAL); + + memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); + if (type & LL_STATFS_LMV) + exp = sbi->ll_md_exp; + else if (type & LL_STATFS_LOV) + exp = sbi->ll_dt_exp; + else + GOTO(out_statfs, rc = -ENODEV); + + flags = (type & LL_STATFS_NODELAY) ? OBD_STATFS_NODELAY : 0; + rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, &flags); + if (rc) + GOTO(out_statfs, rc); +out_statfs: + if (buf) + obd_ioctl_freedata(buf, len); + return rc; +} + +int ll_process_config(struct lustre_cfg *lcfg) +{ + char *ptr; + void *sb; + struct lprocfs_static_vars lvars; + unsigned long x; + int rc = 0; + + lprocfs_llite_init_vars(&lvars); + + /* The instance name contains the sb: lustre-client-aacfe000 */ + ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); + if (!ptr || !*(++ptr)) + return -EINVAL; + if (sscanf(ptr, "%lx", &x) != 1) + return -EINVAL; + sb = (void *)x; + /* This better be a real Lustre superblock! */ + LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC); + + /* Note we have not called client_common_fill_super yet, so + proc fns must be able to handle that! */ + rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, + lcfg, sb); + if (rc > 0) + rc = 0; + return(rc); +} + +/* this function prepares md_op_data hint for passing ot down to MD stack. */ +struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, + int mode, __u32 opc, void *data) +{ + LASSERT(i1 != NULL); + + if (namelen > ll_i2sbi(i1)->ll_namelen) + return ERR_PTR(-ENAMETOOLONG); + + if (op_data == NULL) + OBD_ALLOC_PTR(op_data); + + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + ll_i2gids(op_data->op_suppgids, i1, i2); + op_data->op_fid1 = *ll_inode2fid(i1); + op_data->op_capa1 = ll_mdscapa_get(i1); + + if (i2) { + op_data->op_fid2 = *ll_inode2fid(i2); + op_data->op_capa2 = ll_mdscapa_get(i2); + } else { + fid_zero(&op_data->op_fid2); + op_data->op_capa2 = NULL; + } + + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = cfs_time_current_sec(); + op_data->op_fsuid = current_fsuid(); + op_data->op_fsgid = current_fsgid(); + op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_bias = 0; + op_data->op_cli_flags = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, NULL)) + op_data->op_bias |= MDS_CREATE_VOLATILE; + op_data->op_opc = opc; + op_data->op_mds = 0; + op_data->op_data = data; + + /* If the file is being opened after mknod() (normally due to NFS) + * try to use the default stripe data from parent directory for + * allocating OST objects. Try to pass the parent FID to MDS. */ + if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) && + !ll_i2info(i2)->lli_has_smd) { + struct ll_inode_info *lli = ll_i2info(i2); + + spin_lock(&lli->lli_lock); + if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid))) + op_data->op_fid1 = lli->lli_pfid; + spin_unlock(&lli->lli_lock); + /** We ignore parent's capability temporary. */ + } + + /* When called by ll_setattr_raw, file is i1. */ + if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; + + return op_data; +} + +void ll_finish_md_op_data(struct md_op_data *op_data) +{ + capa_put(op_data->op_capa1); + capa_put(op_data->op_capa2); + OBD_FREE_PTR(op_data); +} + +int ll_show_options(struct seq_file *seq, struct dentry *dentry) +{ + struct ll_sb_info *sbi; + + LASSERT((seq != NULL) && (dentry != NULL)); + sbi = ll_s2sbi(dentry->d_sb); + + if (sbi->ll_flags & LL_SBI_NOLCK) + seq_puts(seq, ",nolock"); + + if (sbi->ll_flags & LL_SBI_FLOCK) + seq_puts(seq, ",flock"); + + if (sbi->ll_flags & LL_SBI_LOCALFLOCK) + seq_puts(seq, ",localflock"); + + if (sbi->ll_flags & LL_SBI_USER_XATTR) + seq_puts(seq, ",user_xattr"); + + if (sbi->ll_flags & LL_SBI_LAZYSTATFS) + seq_puts(seq, ",lazystatfs"); + + if (sbi->ll_flags & LL_SBI_USER_FID2PATH) + seq_puts(seq, ",user_fid2path"); + + RETURN(0); +} + +/** + * Get obd name by cmd, and copy out to user space + */ +int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obd; + ENTRY; + + if (cmd == OBD_IOC_GETDTNAME) + obd = class_exp2obd(sbi->ll_dt_exp); + else if (cmd == OBD_IOC_GETMDNAME) + obd = class_exp2obd(sbi->ll_md_exp); + else + RETURN(-EINVAL); + + if (!obd) + RETURN(-ENOENT); + + if (copy_to_user((void *)arg, obd->obd_name, + strlen(obd->obd_name) + 1)) + RETURN(-EFAULT); + + RETURN(0); +} + +/** + * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the + * fsname will be returned in this buffer; otherwise, a static buffer will be + * used to store the fsname and returned to caller. + */ +char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) +{ + static char fsname_static[MTI_NAME_MAXLEN]; + struct lustre_sb_info *lsi = s2lsi(sb); + char *ptr; + int len; + + if (buf == NULL) { + /* this means the caller wants to use static buffer + * and it doesn't care about race. Usually this is + * in error reporting path */ + buf = fsname_static; + buflen = sizeof(fsname_static); + } + + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + if (unlikely(len >= buflen)) + len = buflen - 1; + strncpy(buf, lsi->lsi_lmd->lmd_profile, len); + buf[len] = '\0'; + + return buf; +} + +static char* ll_d_path(struct dentry *dentry, char *buf, int bufsize) +{ + char *path = NULL; + + struct path p; + + p.dentry = dentry; + p.mnt = current->fs->root.mnt; + path_get(&p); + path = d_path(&p, buf, bufsize); + path_put(&p); + + return path; +} + +void ll_dirty_page_discard_warn(struct page *page, int ioret) +{ + char *buf, *path = NULL; + struct dentry *dentry = NULL; + struct ccc_object *obj = cl_inode2ccc(page->mapping->host); + + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); + if (buf != NULL) { + dentry = d_find_alias(page->mapping->host); + if (dentry != NULL) + path = ll_d_path(dentry, buf, PAGE_SIZE); + } + + CWARN("%s: dirty page discard: %s/fid: "DFID"/%s may get corrupted " + "(rc %d)\n", ll_get_fsname(page->mapping->host->i_sb, NULL, 0), + s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, + PFID(&obj->cob_header.coh_lu.loh_fid), + (path && !IS_ERR(path)) ? path : "", ioret); + + if (dentry != NULL) + dput(dentry); + + if (buf != NULL) + free_page((unsigned long)buf); +} diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c new file mode 100644 index 000000000000..d9590d85634a --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -0,0 +1,507 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include + +struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, + int *type); + +static struct vm_operations_struct ll_file_vm_ops; + +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, + size_t count) +{ + policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) + + (vma->vm_pgoff << PAGE_CACHE_SHIFT); + policy->l_extent.end = (policy->l_extent.start + count - 1) | + ~CFS_PAGE_MASK; +} + +struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, + size_t count) +{ + struct vm_area_struct *vma, *ret = NULL; + ENTRY; + + /* mmap_sem must have been held by caller. */ + LASSERT(!down_write_trylock(&mm->mmap_sem)); + + for(vma = find_vma(mm, addr); + vma != NULL && vma->vm_start < (addr + count); vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops && + vma->vm_flags & VM_SHARED) { + ret = vma; + break; + } + } + RETURN(ret); +} + +/** + * API independent part for page fault initialization. + * \param vma - virtual memory area addressed to page fault + * \param env - corespondent lu_env to processing + * \param nest - nested level + * \param index - page index corespondent to fault. + * \parm ra_flags - vma readahead flags. + * + * \return allocated and initialized env for fault operation. + * \retval EINVAL if env can't allocated + * \return other error codes from cl_io_init. + */ +struct cl_io *ll_fault_io_init(struct vm_area_struct *vma, + struct lu_env **env_ret, + struct cl_env_nest *nest, + pgoff_t index, unsigned long *ra_flags) +{ + struct file *file = vma->vm_file; + struct inode *inode = file->f_dentry->d_inode; + struct cl_io *io; + struct cl_fault_io *fio; + struct lu_env *env; + ENTRY; + + *env_ret = NULL; + if (ll_file_nolock(file)) + RETURN(ERR_PTR(-EOPNOTSUPP)); + + /* + * page fault can be called when lustre IO is + * already active for the current thread, e.g., when doing read/write + * against user level buffer mapped from Lustre buffer. To avoid + * stomping on existing context, optionally force an allocation of a new + * one. + */ + env = cl_env_nested_get(nest); + if (IS_ERR(env)) + RETURN(ERR_PTR(-EINVAL)); + + *env_ret = env; + + io = ccc_env_thread_io(env); + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + fio->ft_index = index; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); + + if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) { + struct ccc_io *cio = ccc_env_io(env); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(cio->cui_cl.cis_io == io); + + /* mmap lock must be MANDATORY + * it has to cache pages. */ + io->ci_lockreq = CILR_MANDATORY; + + cio->cui_fd = fd; + } + + return io; +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_env_nest nest; + int result; + sigset_t set; + struct inode *inode; + struct ll_inode_info *lli; + ENTRY; + + LASSERT(vmpage != NULL); + + io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out, result); + + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + + /* we grab lli_trunc_sem to exclude truncate case. + * Otherwise, we could add dirty pages into osc cache + * while truncate is on-going. */ + inode = ccc_object_inode(io->ci_obj); + lli = ll_i2info(inode); + down_read(&lli->lli_trunc_sem); + + result = cl_io_loop(env, io); + + up_read(&lli->lli_trunc_sem); + + cfs_restore_sigs(set); + + if (result == 0) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ll_inode_info *lli = ll_i2info(inode); + + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. + */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + } + EXIT; + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); + + LASSERT(ergo(result == 0, PageLocked(vmpage))); + return(result); +} + + + +static inline int to_fault_error(int result) +{ + switch(result) { + case 0: + result = VM_FAULT_LOCKED; + break; + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + return result; +} + +/** + * Lustre implementation of a vm_operations_struct::fault() method, called by + * VM to server page fault (both in kernel and user space). + * + * \param vma - is virtiual area struct related to page fault + * \param vmf - structure which describe type and address where hit fault + * + * \return allocated and filled _locked_ page for address + * \retval VM_FAULT_ERROR on general error + * \retval NOPAGE_OOM not have memory for allocate new page + */ +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio = NULL; + struct page *vmpage; + unsigned long ra_flags; + struct cl_env_nest nest; + int result; + int fault_ret = 0; + ENTRY; + + io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) + RETURN(to_fault_error(PTR_ERR(io))); + + result = io->ci_result; + if (result == 0) { + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = NULL; + vio->u.fault.fault.ft_vmf = vmf; + + result = cl_io_loop(env, io); + + fault_ret = vio->u.fault.fault.ft_flags; + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + page_cache_release(vmpage); + vmf->page = NULL; + } + } + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + vma->vm_flags |= ra_flags; + if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) + fault_ret |= to_fault_error(result); + + CDEBUG(D_MMAP, "%s fault %d/%d\n", + current->comm, fault_ret, result); + RETURN(fault_ret); +} + +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + int result; + sigset_t set; + + /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite + * so that it can be killed by admin but not cause segfault by + * other signals. */ + set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); + +restart: + result = ll_fault0(vma, vmf); + LASSERT(!(result & VM_FAULT_LOCKED)); + if (result == 0) { + struct page *vmpage = vmf->page; + + /* check if this page has been truncated */ + lock_page(vmpage); + if (unlikely(vmpage->mapping == NULL)) { /* unlucky */ + unlock_page(vmpage); + page_cache_release(vmpage); + vmf->page = NULL; + + if (!printed && ++count > 16) { + CWARN("the page is under heavy contention," + "maybe your app(%s) needs revising :-)\n", + current->comm); + printed = true; + } + + goto restart; + } + + result |= VM_FAULT_LOCKED; + } + cfs_restore_sigs(set); + return result; +} + +static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy" + " contention.\n", + current->comm, vmf->pgoff, + vma->vm_file->f_dentry->d_inode->i_ino); + printed = true; + } + } while (retry); + + switch(result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + return result; +} + +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in ccc_object::cob_mmap_cnt. + */ +static void ll_vm_open(struct vm_area_struct * vma) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ccc_object *vob = cl_inode2ccc(inode); + + ENTRY; + LASSERT(vma->vm_file); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + atomic_inc(&vob->cob_mmap_cnt); + EXIT; +} + +/** + * Dual to ll_vm_open(). + */ +static void ll_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ccc_object *vob = cl_inode2ccc(inode); + + ENTRY; + LASSERT(vma->vm_file); + atomic_dec(&vob->cob_mmap_cnt); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + EXIT; +} + + +/* return the user space pointer that maps to a file offset via a vma */ +static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte) +{ + return vma->vm_start + (byte - ((__u64)vma->vm_pgoff << PAGE_CACHE_SHIFT)); + +} + +/* XXX put nice comment here. talk about __free_pte -> dirty pages and + * nopage's reference passing to the pte */ +int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) +{ + int rc = -ENOENT; + ENTRY; + + LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first); + if (mapping_mapped(mapping)) { + rc = 0; + unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1, + last - first + 1, 0); + } + + RETURN(rc); +} + +static struct vm_operations_struct ll_file_vm_ops = { + .fault = ll_fault, + .page_mkwrite = ll_page_mkwrite, + .open = ll_vm_open, + .close = ll_vm_close, +}; + +int ll_file_mmap(struct file *file, struct vm_area_struct * vma) +{ + struct inode *inode = file->f_dentry->d_inode; + int rc; + ENTRY; + + if (ll_file_nolock(file)) + RETURN(-EOPNOTSUPP); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); + rc = generic_file_mmap(file, vma); + if (rc == 0) { + vma->vm_ops = &ll_file_vm_ops; + vma->vm_ops->open(vma); + /* update the inode's size and mtime */ + rc = ll_glimpse_size(inode); + } + + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c new file mode 100644 index 000000000000..28cc41e90581 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/llite_nfs.c + * + * NFS export of Lustre Light File System + * + * Author: Yury Umanets + * Author: Huang Hua + */ + +#define DEBUG_SUBSYSTEM S_LLITE +#include +#include "llite_internal.h" +#include + +__u32 get_uuid2int(const char *name, int len) +{ + __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9; + while (len--) { + __u32 key = key1 + (key0 ^ (*name++ * 7152373)); + if (key & 0x80000000) key -= 0x7fffffff; + key1 = key0; + key0 = key; + } + return (key0 << 1); +} + +static int ll_nfs_test_inode(struct inode *inode, void *opaque) +{ + return lu_fid_eq(&ll_i2info(inode)->lli_fid, + (struct lu_fid *)opaque); +} + +struct inode *search_inode_for_lustre(struct super_block *sb, + const struct lu_fid *fid) +{ + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct ptlrpc_request *req = NULL; + struct inode *inode = NULL; + int eadatalen = 0; + unsigned long hash = cl_fid_build_ino(fid, + ll_need_32bit_api(sbi)); + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_INFO, "searching inode for:(%lu,"DFID")\n", hash, PFID(fid)); + + inode = ilookup5(sb, hash, ll_nfs_test_inode, (void *)fid); + if (inode) + RETURN(inode); + + rc = ll_get_max_mdsize(sbi, &eadatalen); + if (rc) + RETURN(ERR_PTR(rc)); + + /* Because inode is NULL, ll_prep_md_op_data can not + * be used here. So we allocate op_data ourselves */ + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + return ERR_PTR(-ENOMEM); + + op_data->op_fid1 = *fid; + op_data->op_mode = eadatalen; + op_data->op_valid = OBD_MD_FLEASIZE; + + /* mds_fid2dentry ignores f_type */ + rc = md_getattr(sbi->ll_md_exp, op_data, &req); + OBD_FREE_PTR(op_data); + if (rc) { + CERROR("can't get object attrs, fid "DFID", rc %d\n", + PFID(fid), rc); + RETURN(ERR_PTR(rc)); + } + rc = ll_prep_inode(&inode, req, sb, NULL); + ptlrpc_req_finished(req); + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(inode); +} + +struct lustre_nfs_fid { + struct lu_fid lnf_child; + struct lu_fid lnf_parent; +}; + +static struct dentry * +ll_iget_for_nfs(struct super_block *sb, struct lu_fid *fid, struct lu_fid *parent) +{ + struct inode *inode; + struct dentry *result; + ENTRY; + + CDEBUG(D_INFO, "Get dentry for fid: "DFID"\n", PFID(fid)); + if (!fid_is_sane(fid)) + RETURN(ERR_PTR(-ESTALE)); + + inode = search_inode_for_lustre(sb, fid); + if (IS_ERR(inode)) + RETURN(ERR_PTR(PTR_ERR(inode))); + + if (is_bad_inode(inode)) { + /* we didn't find the right inode.. */ + iput(inode); + RETURN(ERR_PTR(-ESTALE)); + } + + /** + * It is an anonymous dentry without OST objects created yet. + * We have to find the parent to tell MDS how to init lov objects. + */ + if (S_ISREG(inode->i_mode) && !ll_i2info(inode)->lli_has_smd && + parent != NULL) { + struct ll_inode_info *lli = ll_i2info(inode); + + spin_lock(&lli->lli_lock); + lli->lli_pfid = *parent; + spin_unlock(&lli->lli_lock); + } + + result = d_obtain_alias(inode); + if (IS_ERR(result)) + RETURN(result); + + ll_dops_init(result, 1, 0); + + RETURN(result); +} + +#define LUSTRE_NFS_FID 0x97 + +/** + * \a connectable - is nfsd will connect himself or this should be done + * at lustre + * + * The return value is file handle type: + * 1 -- contains child file handle; + * 2 -- contains child file handle and parent file handle; + * 255 -- error. + */ +static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, + struct inode *parent) +{ + struct lustre_nfs_fid *nfs_fid = (void *)fh; + ENTRY; + + CDEBUG(D_INFO, "encoding for (%lu,"DFID") maxlen=%d minlen=%d\n", + inode->i_ino, PFID(ll_inode2fid(inode)), *plen, + (int)sizeof(struct lustre_nfs_fid)); + + if (*plen < sizeof(struct lustre_nfs_fid) / 4) + RETURN(255); + + nfs_fid->lnf_child = *ll_inode2fid(inode); + nfs_fid->lnf_parent = *ll_inode2fid(parent); + *plen = sizeof(struct lustre_nfs_fid) / 4; + + RETURN(LUSTRE_NFS_FID); +} + +static int ll_nfs_get_name_filldir(void *cookie, const char *name, int namelen, + loff_t hash, u64 ino, unsigned type) +{ + /* It is hack to access lde_fid for comparison with lgd_fid. + * So the input 'name' must be part of the 'lu_dirent'. */ + struct lu_dirent *lde = container_of0(name, struct lu_dirent, lde_name); + struct ll_getname_data *lgd = cookie; + struct lu_fid fid; + + fid_le_to_cpu(&fid, &lde->lde_fid); + if (lu_fid_eq(&fid, &lgd->lgd_fid)) { + memcpy(lgd->lgd_name, name, namelen); + lgd->lgd_name[namelen] = 0; + lgd->lgd_found = 1; + } + return lgd->lgd_found; +} + +static int ll_get_name(struct dentry *dentry, char *name, + struct dentry *child) +{ + struct inode *dir = dentry->d_inode; + struct ll_getname_data lgd; + __u64 offset = 0; + int rc; + ENTRY; + + if (!dir || !S_ISDIR(dir->i_mode)) + GOTO(out, rc = -ENOTDIR); + + if (!dir->i_fop) + GOTO(out, rc = -EINVAL); + + lgd.lgd_name = name; + lgd.lgd_fid = ll_i2info(child->d_inode)->lli_fid; + lgd.lgd_found = 0; + + mutex_lock(&dir->i_mutex); + rc = ll_dir_read(dir, &offset, &lgd, ll_nfs_get_name_filldir); + mutex_unlock(&dir->i_mutex); + if (!rc && !lgd.lgd_found) + rc = -ENOENT; + EXIT; + +out: + return rc; +} + +static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent)); +} + +static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; + + if (fh_type != LUSTRE_NFS_FID) + RETURN(ERR_PTR(-EPROTO)); + + RETURN(ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL)); +} + +static struct dentry *ll_get_parent(struct dentry *dchild) +{ + struct ptlrpc_request *req = NULL; + struct inode *dir = dchild->d_inode; + struct ll_sb_info *sbi; + struct dentry *result = NULL; + struct mdt_body *body; + static char dotdot[] = ".."; + struct md_op_data *op_data; + int rc; + int lmmsize; + ENTRY; + + LASSERT(dir && S_ISDIR(dir->i_mode)); + + sbi = ll_s2sbi(dir->i_sb); + + CDEBUG(D_INFO, "getting parent for (%lu,"DFID")\n", + dir->i_ino, PFID(ll_inode2fid(dir))); + + rc = ll_get_max_mdsize(sbi, &lmmsize); + if (rc != 0) + RETURN(ERR_PTR(rc)); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, + strlen(dotdot), lmmsize, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN((void *)op_data); + + rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); + ll_finish_md_op_data(op_data); + if (rc) { + CERROR("failure %d inode %lu get parent\n", rc, dir->i_ino); + RETURN(ERR_PTR(rc)); + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body->valid & OBD_MD_FLID); + + CDEBUG(D_INFO, "parent for "DFID" is "DFID"\n", + PFID(ll_inode2fid(dir)), PFID(&body->fid1)); + + result = ll_iget_for_nfs(dir->i_sb, &body->fid1, NULL); + + ptlrpc_req_finished(req); + RETURN(result); +} + +struct export_operations lustre_export_operations = { + .get_parent = ll_get_parent, + .encode_fh = ll_encode_fh, + .get_name = ll_get_name, + .fh_to_dentry = ll_fh_to_dentry, + .fh_to_parent = ll_fh_to_parent, +}; diff --git a/drivers/staging/lustre/lustre/llite/llite_rmtacl.c b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c new file mode 100644 index 000000000000..4c610369cb9b --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/llite_rmtacl.c @@ -0,0 +1,301 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/llite_rmtacl.c + * + * Lustre Remote User Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifdef CONFIG_FS_POSIX_ACL + +#include +#include +#include "llite_internal.h" + +static inline __u32 rce_hashfunc(uid_t id) +{ + return id & (RCE_HASHES - 1); +} + +static inline __u32 ee_hashfunc(uid_t id) +{ + return id & (EE_HASHES - 1); +} + +obd_valid rce_ops2valid(int ops) +{ + switch (ops) { + case RMT_LSETFACL: + return OBD_MD_FLRMTLSETFACL; + case RMT_LGETFACL: + return OBD_MD_FLRMTLGETFACL; + case RMT_RSETFACL: + return OBD_MD_FLRMTRSETFACL; + case RMT_RGETFACL: + return OBD_MD_FLRMTRGETFACL; + default: + return 0; + } +} + +static struct rmtacl_ctl_entry *rce_alloc(pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce; + + OBD_ALLOC_PTR(rce); + if (!rce) + return NULL; + + INIT_LIST_HEAD(&rce->rce_list); + rce->rce_key = key; + rce->rce_ops = ops; + + return rce; +} + +static void rce_free(struct rmtacl_ctl_entry *rce) +{ + if (!list_empty(&rce->rce_list)) + list_del(&rce->rce_list); + + OBD_FREE_PTR(rce); +} + +static struct rmtacl_ctl_entry *__rct_search(struct rmtacl_ctl_table *rct, + pid_t key) +{ + struct rmtacl_ctl_entry *rce; + struct list_head *head = &rct->rct_entries[rce_hashfunc(key)]; + + list_for_each_entry(rce, head, rce_list) + if (rce->rce_key == key) + return rce; + + return NULL; +} + +struct rmtacl_ctl_entry *rct_search(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + spin_unlock(&rct->rct_lock); + return rce; +} + +int rct_add(struct rmtacl_ctl_table *rct, pid_t key, int ops) +{ + struct rmtacl_ctl_entry *rce, *e; + + rce = rce_alloc(key, ops); + if (rce == NULL) + return -ENOMEM; + + spin_lock(&rct->rct_lock); + e = __rct_search(rct, key); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale rmtacl_entry found: " + "[key: %d] [ops: %d]\n", (int)key, ops); + rce_free(e); + } + list_add_tail(&rce->rce_list, &rct->rct_entries[rce_hashfunc(key)]); + spin_unlock(&rct->rct_lock); + + return 0; +} + +int rct_del(struct rmtacl_ctl_table *rct, pid_t key) +{ + struct rmtacl_ctl_entry *rce; + + spin_lock(&rct->rct_lock); + rce = __rct_search(rct, key); + if (rce) + rce_free(rce); + spin_unlock(&rct->rct_lock); + + return rce ? 0 : -ENOENT; +} + +void rct_init(struct rmtacl_ctl_table *rct) +{ + int i; + + spin_lock_init(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + INIT_LIST_HEAD(&rct->rct_entries[i]); +} + +void rct_fini(struct rmtacl_ctl_table *rct) +{ + struct rmtacl_ctl_entry *rce; + int i; + + spin_lock(&rct->rct_lock); + for (i = 0; i < RCE_HASHES; i++) + while (!list_empty(&rct->rct_entries[i])) { + rce = list_entry(rct->rct_entries[i].next, + struct rmtacl_ctl_entry, rce_list); + rce_free(rce); + } + spin_unlock(&rct->rct_lock); +} + + +static struct eacl_entry *ee_alloc(pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee; + + OBD_ALLOC_PTR(ee); + if (!ee) + return NULL; + + INIT_LIST_HEAD(&ee->ee_list); + ee->ee_key = key; + ee->ee_fid = *fid; + ee->ee_type = type; + ee->ee_acl = header; + + return ee; +} + +void ee_free(struct eacl_entry *ee) +{ + if (!list_empty(&ee->ee_list)) + list_del(&ee->ee_list); + + if (ee->ee_acl) + lustre_ext_acl_xattr_free(ee->ee_acl); + + OBD_FREE_PTR(ee); +} + +static struct eacl_entry *__et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + LASSERT(fid != NULL); + list_for_each_entry(ee, head, ee_list) + if (ee->ee_key == key) { + if (lu_fid_eq(&ee->ee_fid, fid) && + ee->ee_type == type) { + list_del_init(&ee->ee_list); + return ee; + } + } + + return NULL; +} + +struct eacl_entry *et_search_del(struct eacl_table *et, pid_t key, + struct lu_fid *fid, int type) +{ + struct eacl_entry *ee; + + spin_lock(&et->et_lock); + ee = __et_search_del(et, key, fid, type); + spin_unlock(&et->et_lock); + return ee; +} + +void et_search_free(struct eacl_table *et, pid_t key) +{ + struct eacl_entry *ee, *next; + struct list_head *head = &et->et_entries[ee_hashfunc(key)]; + + spin_lock(&et->et_lock); + list_for_each_entry_safe(ee, next, head, ee_list) + if (ee->ee_key == key) + ee_free(ee); + + spin_unlock(&et->et_lock); +} + +int ee_add(struct eacl_table *et, pid_t key, struct lu_fid *fid, int type, + ext_acl_xattr_header *header) +{ + struct eacl_entry *ee, *e; + + ee = ee_alloc(key, fid, type, header); + if (ee == NULL) + return -ENOMEM; + + spin_lock(&et->et_lock); + e = __et_search_del(et, key, fid, type); + if (unlikely(e != NULL)) { + CWARN("Unexpected stale eacl_entry found: " + "[key: %d] [fid: "DFID"] [type: %d]\n", + (int)key, PFID(fid), type); + ee_free(e); + } + list_add_tail(&ee->ee_list, &et->et_entries[ee_hashfunc(key)]); + spin_unlock(&et->et_lock); + + return 0; +} + +void et_init(struct eacl_table *et) +{ + int i; + + spin_lock_init(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + INIT_LIST_HEAD(&et->et_entries[i]); +} + +void et_fini(struct eacl_table *et) +{ + struct eacl_entry *ee; + int i; + + spin_lock(&et->et_lock); + for (i = 0; i < EE_HASHES; i++) + while (!list_empty(&et->et_entries[i])) { + ee = list_entry(et->et_entries[i].next, + struct eacl_entry, ee_list); + ee_free(ee); + } + spin_unlock(&et->et_lock); +} + +#endif diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c new file mode 100644 index 000000000000..b72f25738bab --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -0,0 +1,869 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * linux/drivers/block/loop.c + * + * Written by Theodore Ts'o, 3/29/93 + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + * + * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 + * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 + * + * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 + * + * Added devfs support - Richard Gooch 16-Jan-1998 + * + * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 + * + * Loadable modules and other fixes by AK, 1998 + * + * Maximum number of loop devices now dynamic via max_loop module parameter. + * Russell Kroll 19990701 + * + * Maximum number of loop devices when compiled-in now selectable by passing + * max_loop=<1-255> to the kernel on boot. + * Erik I. Bols?, , Oct 31, 1999 + * + * Completely rewrite request handling to be make_request_fn style and + * non blocking, pushing work to a helper thread. Lots of fixes from + * Al Viro too. + * Jens Axboe , Nov 2000 + * + * Support up to 256 loop devices + * Heinz Mauelshagen , Feb 2002 + * + * Support for falling back on the write file operation when the address space + * operations prepare_write and/or commit_write are not available on the + * backing filesystem. + * Anton Altaparmakov, 16 Feb 2005 + * + * Still To Fix: + * - Advisory locking is ignored here. + * - Should use an own CAP_* category instead of CAP_SYS_ADMIN + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for invalidate_bdev() */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "llite_internal.h" + +#define LLOOP_MAX_SEGMENTS LNET_MAX_IOV + +/* Possible states of device */ +enum { + LLOOP_UNBOUND, + LLOOP_BOUND, + LLOOP_RUNDOWN, +}; + +struct lloop_device { + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*ioctl)(struct lloop_device *, int cmd, + unsigned long arg); + + struct file *lo_backing_file; + struct block_device *lo_device; + unsigned lo_blocksize; + + int old_gfp_mask; + + spinlock_t lo_lock; + struct bio *lo_bio; + struct bio *lo_biotail; + int lo_state; + struct semaphore lo_sem; + struct mutex lo_ctl_mutex; + atomic_t lo_pending; + wait_queue_head_t lo_bh_wait; + + struct request_queue *lo_queue; + + const struct lu_env *lo_env; + struct cl_io lo_io; + struct ll_dio_pages lo_pvec; + + /* data to handle bio for lustre. */ + struct lo_request_data { + struct page *lrd_pages[LLOOP_MAX_SEGMENTS]; + loff_t lrd_offsets[LLOOP_MAX_SEGMENTS]; + } lo_requests[1]; +}; + +/* + * Loop flags + */ +enum { + LO_FLAGS_READ_ONLY = 1, +}; + +static int lloop_major; +#define MAX_LOOP_DEFAULT 16 +static int max_loop = MAX_LOOP_DEFAULT; +static struct lloop_device *loop_dev; +static struct gendisk **disks; +static struct mutex lloop_mutex; +static void *ll_iocontrol_magic = NULL; + +static loff_t get_loop_size(struct lloop_device *lo, struct file *file) +{ + loff_t size, offset, loopsize; + + /* Compute loopsize in bytes */ + size = i_size_read(file->f_mapping->host); + offset = lo->lo_offset; + loopsize = size - offset; + if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) + loopsize = lo->lo_sizelimit; + + /* + * Unfortunately, if we want to do I/O on the device, + * the number of 512-byte sectors has to fit into a sector_t. + */ + return loopsize >> 9; +} + +static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) +{ + const struct lu_env *env = lo->lo_env; + struct cl_io *io = &lo->lo_io; + struct inode *inode = lo->lo_backing_file->f_dentry->d_inode; + struct cl_object *obj = ll_i2info(inode)->lli_clob; + pgoff_t offset; + int ret; + int i; + int rw; + obd_count page_count = 0; + struct bio_vec *bvec; + struct bio *bio; + ssize_t bytes; + + struct ll_dio_pages *pvec = &lo->lo_pvec; + struct page **pages = pvec->ldp_pages; + loff_t *offsets = pvec->ldp_offsets; + + truncate_inode_pages(inode->i_mapping, 0); + + /* initialize the IO */ + memset(io, 0, sizeof(*io)); + io->ci_obj = obj; + ret = cl_io_init(env, io, CIT_MISC, obj); + if (ret) + return io->ci_result; + io->ci_lockreq = CILR_NEVER; + + LASSERT(head != NULL); + rw = head->bi_rw; + for (bio = head; bio != NULL; bio = bio->bi_next) { + LASSERT(rw == bio->bi_rw); + + offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, i) { + BUG_ON(bvec->bv_offset != 0); + BUG_ON(bvec->bv_len != PAGE_CACHE_SIZE); + + pages[page_count] = bvec->bv_page; + offsets[page_count] = offset; + page_count++; + offset += bvec->bv_len; + } + LASSERT(page_count <= LLOOP_MAX_SEGMENTS); + } + + ll_stats_ops_tally(ll_i2sbi(inode), + (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ, + page_count); + + pvec->ldp_size = page_count << PAGE_CACHE_SHIFT; + pvec->ldp_nr = page_count; + + /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to + * write those pages into OST. Even worse case is that more pages + * would be asked to write out to swap space, and then finally get here + * again. + * Unfortunately this is NOT easy to fix. + * Thoughts on solution: + * 0. Define a reserved pool for cl_pages, which could be a list of + * pre-allocated cl_pages; + * 1. Define a new operation in cl_object_operations{}, says clo_depth, + * which measures how many layers for this lustre object. Generally + * speaking, the depth would be 2, one for llite, and one for lovsub. + * However, for SNS, there will be more since we need additional page + * to store parity; + * 2. Reserve the # of (page_count * depth) cl_pages from the reserved + * pool. Afterwards, the clio would allocate the pages from reserved + * pool, this guarantees we neeedn't allocate the cl_pages from + * generic cl_page slab cache. + * Of course, if there is NOT enough pages in the pool, we might + * be asked to write less pages once, this purely depends on + * implementation. Anyway, we should be careful to avoid deadlocking. + */ + mutex_lock(&inode->i_mutex); + bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); + mutex_unlock(&inode->i_mutex); + cl_io_fini(env, io); + return (bytes == pvec->ldp_size) ? 0 : (int)bytes; +} + +/* + * Add bio to back of pending list + */ +static void loop_add_bio(struct lloop_device *lo, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&lo->lo_lock, flags); + if (lo->lo_biotail) { + lo->lo_biotail->bi_next = bio; + lo->lo_biotail = bio; + } else + lo->lo_bio = lo->lo_biotail = bio; + spin_unlock_irqrestore(&lo->lo_lock, flags); + + atomic_inc(&lo->lo_pending); + if (waitqueue_active(&lo->lo_bh_wait)) + wake_up(&lo->lo_bh_wait); +} + +/* + * Grab first pending buffer + */ +static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) +{ + struct bio *first; + struct bio **bio; + unsigned int count = 0; + unsigned int page_count = 0; + int rw; + + spin_lock_irq(&lo->lo_lock); + first = lo->lo_bio; + if (unlikely(first == NULL)) { + spin_unlock_irq(&lo->lo_lock); + return 0; + } + + /* TODO: need to split the bio, too bad. */ + LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS); + + rw = first->bi_rw; + bio = &lo->lo_bio; + while (*bio && (*bio)->bi_rw == rw) { + CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", + (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size, + page_count, (*bio)->bi_vcnt); + if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) + break; + + + page_count += (*bio)->bi_vcnt; + count++; + bio = &(*bio)->bi_next; + } + if (*bio) { + /* Some of bios can't be mergable. */ + lo->lo_bio = *bio; + *bio = NULL; + } else { + /* Hit the end of queue */ + lo->lo_biotail = NULL; + lo->lo_bio = NULL; + } + *req = first; + spin_unlock_irq(&lo->lo_lock); + return count; +} + +static ll_mrf_ret +loop_make_request(struct request_queue *q, struct bio *old_bio) +{ + struct lloop_device *lo = q->queuedata; + int rw = bio_rw(old_bio); + int inactive; + + if (!lo) + goto err; + + CDEBUG(D_INFO, "submit bio sector %llu size %u\n", + (unsigned long long)old_bio->bi_sector, old_bio->bi_size); + + spin_lock_irq(&lo->lo_lock); + inactive = (lo->lo_state != LLOOP_BOUND); + spin_unlock_irq(&lo->lo_lock); + if (inactive) + goto err; + + if (rw == WRITE) { + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { + rw = READ; + } else if (rw != READ) { + CERROR("lloop: unknown command (%x)\n", rw); + goto err; + } + loop_add_bio(lo, old_bio); + LL_MRF_RETURN(0); +err: + cfs_bio_io_error(old_bio, old_bio->bi_size); + LL_MRF_RETURN(0); +} + + +static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) +{ + int ret; + ret = do_bio_lustrebacked(lo, bio); + while (bio) { + struct bio *tmp = bio->bi_next; + bio->bi_next = NULL; + cfs_bio_endio(bio, bio->bi_size, ret); + bio = tmp; + } +} + +static inline int loop_active(struct lloop_device *lo) +{ + return atomic_read(&lo->lo_pending) || + (lo->lo_state == LLOOP_RUNDOWN); +} + +/* + * worker thread that handles reads/writes to file backed loop devices, + * to avoid blocking in our make_request_fn. + */ +static int loop_thread(void *data) +{ + struct lloop_device *lo = data; + struct bio *bio; + unsigned int count; + unsigned long times = 0; + unsigned long total_count = 0; + + struct lu_env *env; + int refcheck; + int ret = 0; + + set_user_nice(current, -20); + + lo->lo_state = LLOOP_BOUND; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, ret = PTR_ERR(env)); + + lo->lo_env = env; + memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec)); + lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages; + lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets; + + /* + * up sem, we are running + */ + up(&lo->lo_sem); + + for (;;) { + wait_event(lo->lo_bh_wait, loop_active(lo)); + if (!atomic_read(&lo->lo_pending)) { + int exiting = 0; + spin_lock_irq(&lo->lo_lock); + exiting = (lo->lo_state == LLOOP_RUNDOWN); + spin_unlock_irq(&lo->lo_lock); + if (exiting) + break; + } + + bio = NULL; + count = loop_get_bio(lo, &bio); + if (!count) { + CWARN("lloop(minor: %d): missing bio\n", lo->lo_number); + continue; + } + + total_count += count; + if (total_count < count) { /* overflow */ + total_count = count; + times = 1; + } else { + times++; + } + if ((times & 127) == 0) { + CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n", + total_count, times, total_count / times); + } + + LASSERT(bio != NULL); + LASSERT(count <= atomic_read(&lo->lo_pending)); + loop_handle_bio(lo, bio); + atomic_sub(count, &lo->lo_pending); + } + cl_env_put(env, &refcheck); + +out: + up(&lo->lo_sem); + return ret; +} + +static int loop_set_fd(struct lloop_device *lo, struct file *unused, + struct block_device *bdev, struct file *file) +{ + struct inode *inode; + struct address_space *mapping; + int lo_flags = 0; + int error; + loff_t size; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + error = -EBUSY; + if (lo->lo_state != LLOOP_UNBOUND) + goto out; + + mapping = file->f_mapping; + inode = mapping->host; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || inode->i_sb->s_magic != LL_SUPER_MAGIC) + goto out; + + if (!(file->f_mode & FMODE_WRITE)) + lo_flags |= LO_FLAGS_READ_ONLY; + + size = get_loop_size(lo, file); + + if ((loff_t)(sector_t)size != size) { + error = -EFBIG; + goto out; + } + + /* remove all pages in cache so as dirty pages not to be existent. */ + truncate_inode_pages(mapping, 0); + + set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + + lo->lo_blocksize = PAGE_CACHE_SIZE; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; + lo->lo_backing_file = file; + lo->ioctl = NULL; + lo->lo_sizelimit = 0; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + + lo->lo_bio = lo->lo_biotail = NULL; + + /* + * set queue make_request_fn, and add limits based on lower level + * device + */ + blk_queue_make_request(lo->lo_queue, loop_make_request); + lo->lo_queue->queuedata = lo; + + /* queue parameters */ + CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8))); + blk_queue_logical_block_size(lo->lo_queue, + (unsigned short)PAGE_CACHE_SIZE); + blk_queue_max_hw_sectors(lo->lo_queue, + LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9)); + blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); + + set_capacity(disks[lo->lo_number], size); + bd_set_size(bdev, size << 9); + + set_blocksize(bdev, lo->lo_blocksize); + + kthread_run(loop_thread, lo, "lloop%d", lo->lo_number); + down(&lo->lo_sem); + return 0; + +out: + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return error; +} + +static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, + int count) +{ + struct file *filp = lo->lo_backing_file; + int gfp = lo->old_gfp_mask; + + if (lo->lo_state != LLOOP_BOUND) + return -ENXIO; + + if (lo->lo_refcnt > count) /* we needed one fd for the ioctl */ + return -EBUSY; + + if (filp == NULL) + return -EINVAL; + + spin_lock_irq(&lo->lo_lock); + lo->lo_state = LLOOP_RUNDOWN; + spin_unlock_irq(&lo->lo_lock); + wake_up(&lo->lo_bh_wait); + + down(&lo->lo_sem); + lo->lo_backing_file = NULL; + lo->ioctl = NULL; + lo->lo_device = NULL; + lo->lo_offset = 0; + lo->lo_sizelimit = 0; + lo->lo_flags = 0; + ll_invalidate_bdev(bdev, 0); + set_capacity(disks[lo->lo_number], 0); + bd_set_size(bdev, 0); + mapping_set_gfp_mask(filp->f_mapping, gfp); + lo->lo_state = LLOOP_UNBOUND; + fput(filp); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return 0; +} + +static int lo_open(struct block_device *bdev, fmode_t mode) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); + + return 0; +} + +static int lo_release(struct gendisk *disk, fmode_t mode) +{ + struct lloop_device *lo = disk->private_data; + + mutex_lock(&lo->lo_ctl_mutex); + --lo->lo_refcnt; + mutex_unlock(&lo->lo_ctl_mutex); + + return 0; +} + +/* lloop device node's ioctl function. */ +static int lo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct lloop_device *lo = bdev->bd_disk->private_data; + struct inode *inode = NULL; + int err = 0; + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_DETACH: { + err = loop_clr_fd(lo, bdev, 2); + if (err == 0) + ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + break; + } + + case LL_IOC_LLOOP_INFO: { + struct lu_fid fid; + + LASSERT(lo->lo_backing_file != NULL); + if (inode == NULL) + inode = lo->lo_backing_file->f_dentry->d_inode; + if (lo->lo_state == LLOOP_BOUND) + fid = ll_i2info(inode)->lli_fid; + else + fid_zero(&fid); + + if (copy_to_user((struct lu_fid *)arg, &fid, sizeof(fid))) + err = -EFAULT; + break; + } + + default: + err = -EINVAL; + break; + } + mutex_unlock(&lloop_mutex); + + return err; +} + +static struct block_device_operations lo_fops = { + .owner = THIS_MODULE, + .open = lo_open, + .release = lo_release, + .ioctl = lo_ioctl, +}; + +/* dynamic iocontrol callback. + * This callback is registered in lloop_init and will be called by + * ll_iocontrol_call. + * + * This is a llite regular file ioctl function. It takes the responsibility + * of attaching or detaching a file by a lloop's device numner. + */ +static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, + unsigned int cmd, unsigned long arg, + void *magic, int *rcp) +{ + struct lloop_device *lo = NULL; + struct block_device *bdev = NULL; + int err = 0; + dev_t dev; + + if (magic != ll_iocontrol_magic) + return LLIOC_CONT; + + if (disks == NULL) + GOTO(out1, err = -ENODEV); + + CWARN("Enter llop_ioctl\n"); + + mutex_lock(&lloop_mutex); + switch (cmd) { + case LL_IOC_LLOOP_ATTACH: { + struct lloop_device *lo_free = NULL; + int i; + + for (i = 0; i < max_loop; i++, lo = NULL) { + lo = &loop_dev[i]; + if (lo->lo_state == LLOOP_UNBOUND) { + if (!lo_free) + lo_free = lo; + continue; + } + if (lo->lo_backing_file->f_dentry->d_inode == + file->f_dentry->d_inode) + break; + } + if (lo || !lo_free) + GOTO(out, err = -EBUSY); + + lo = lo_free; + dev = MKDEV(lloop_major, lo->lo_number); + + /* quit if the used pointer is writable */ + if (put_user((long)old_encode_dev(dev), (long*)arg)) + GOTO(out, err = -EFAULT); + + bdev = blkdev_get_by_dev(dev, file->f_mode, NULL); + if (IS_ERR(bdev)) + GOTO(out, err = PTR_ERR(bdev)); + + get_file(file); + err = loop_set_fd(lo, NULL, bdev, file); + if (err) { + fput(file); + ll_blkdev_put(bdev, 0); + } + + break; + } + + case LL_IOC_LLOOP_DETACH_BYDEV: { + int minor; + + dev = old_decode_dev(arg); + if (MAJOR(dev) != lloop_major) + GOTO(out, err = -EINVAL); + + minor = MINOR(dev); + if (minor > max_loop - 1) + GOTO(out, err = -EINVAL); + + lo = &loop_dev[minor]; + if (lo->lo_state != LLOOP_BOUND) + GOTO(out, err = -EINVAL); + + bdev = lo->lo_device; + err = loop_clr_fd(lo, bdev, 1); + if (err == 0) + ll_blkdev_put(bdev, 0); /* grabbed in LLOOP_ATTACH */ + + break; + } + + default: + err = -EINVAL; + break; + } + +out: + mutex_unlock(&lloop_mutex); +out1: + if (rcp) + *rcp = err; + return LLIOC_STOP; +} + +static int __init lloop_init(void) +{ + int i; + unsigned int cmdlist[] = { + LL_IOC_LLOOP_ATTACH, + LL_IOC_LLOOP_DETACH_BYDEV, + }; + + if (max_loop < 1 || max_loop > 256) { + max_loop = MAX_LOOP_DEFAULT; + CWARN("lloop: invalid max_loop (must be between" + " 1 and 256), using default (%u)\n", max_loop); + } + + lloop_major = register_blkdev(0, "lloop"); + if (lloop_major < 0) + return -EIO; + + CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n", + lloop_major, max_loop); + + ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist); + if (ll_iocontrol_magic == NULL) + goto out_mem1; + + OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev)); + if (!loop_dev) + goto out_mem1; + + OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks)); + if (!disks) + goto out_mem2; + + for (i = 0; i < max_loop; i++) { + disks[i] = alloc_disk(1); + if (!disks[i]) + goto out_mem3; + } + + mutex_init(&lloop_mutex); + + for (i = 0; i < max_loop; i++) { + struct lloop_device *lo = &loop_dev[i]; + struct gendisk *disk = disks[i]; + + lo->lo_queue = blk_alloc_queue(GFP_KERNEL); + if (!lo->lo_queue) + goto out_mem4; + + mutex_init(&lo->lo_ctl_mutex); + sema_init(&lo->lo_sem, 0); + init_waitqueue_head(&lo->lo_bh_wait); + lo->lo_number = i; + spin_lock_init(&lo->lo_lock); + disk->major = lloop_major; + disk->first_minor = i; + disk->fops = &lo_fops; + sprintf(disk->disk_name, "lloop%d", i); + disk->private_data = lo; + disk->queue = lo->lo_queue; + } + + /* We cannot fail after we call this, so another loop!*/ + for (i = 0; i < max_loop; i++) + add_disk(disks[i]); + return 0; + +out_mem4: + while (i--) + blk_cleanup_queue(loop_dev[i].lo_queue); + i = max_loop; +out_mem3: + while (i--) + put_disk(disks[i]); + OBD_FREE(disks, max_loop * sizeof(*disks)); +out_mem2: + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +out_mem1: + unregister_blkdev(lloop_major, "lloop"); + ll_iocontrol_unregister(ll_iocontrol_magic); + CERROR("lloop: ran out of memory\n"); + return -ENOMEM; +} + +static void lloop_exit(void) +{ + int i; + + ll_iocontrol_unregister(ll_iocontrol_magic); + for (i = 0; i < max_loop; i++) { + del_gendisk(disks[i]); + blk_cleanup_queue(loop_dev[i].lo_queue); + put_disk(disks[i]); + } + if (ll_unregister_blkdev(lloop_major, "lloop")) + CWARN("lloop: cannot unregister blkdev\n"); + else + CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major); + + OBD_FREE(disks, max_loop * sizeof(*disks)); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); +} + +module_init(lloop_init); +module_exit(lloop_exit); + +CFS_MODULE_PARM(max_loop, "i", int, 0444, "maximum of lloop_device"); +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre virtual block device"); +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c new file mode 100644 index 000000000000..22e19a6c0461 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c @@ -0,0 +1,1397 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +struct proc_dir_entry *proc_lustre_fs_root; + +#ifdef LPROCFS +/* /proc/lustre/llite mount point registration */ +extern struct file_operations vvp_dump_pgcache_file_ops; +struct file_operations ll_rw_extents_stats_fops; +struct file_operations ll_rw_extents_stats_pp_fops; +struct file_operations ll_rw_offset_stats_fops; + +static int ll_rd_blksize(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, "%u\n", osfs.os_bsize); + } + + return rc; +} + +static int ll_rd_kbytestotal(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; + +} + +static int ll_rd_kbytesfree(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; +} + +static int ll_rd_kbytesavail(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; +} + +static int ll_rd_filestotal(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_files); + } + return rc; +} + +static int ll_rd_filesfree(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct obd_statfs osfs; + int rc; + + LASSERT(sb != NULL); + rc = ll_statfs_internal(sb, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_ffree); + } + return rc; + +} + +static int ll_rd_client_type(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct ll_sb_info *sbi = ll_s2sbi((struct super_block *)data); + int rc; + + LASSERT(sbi != NULL); + + *eof = 1; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + rc = snprintf(page, count, "remote client\n"); + else + rc = snprintf(page, count, "local client\n"); + + return rc; +} + +static int ll_rd_fstype(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block*)data; + + LASSERT(sb != NULL); + *eof = 1; + return snprintf(page, count, "%s\n", sb->s_type->name); +} + +static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + + LASSERT(sb != NULL); + *eof = 1; + return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); +} + +static int ll_rd_site_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. + */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), + page, count); +} + +static int ll_rd_max_readahead_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); +} + +static int ll_wr_max_readahead_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || pages_number > num_physpages / 2) { + CERROR("can't set file readahead more than %lu MB\n", + num_physpages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/ + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +static int ll_rd_max_readahead_per_file_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_pages_per_file; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); +} + +static int ll_wr_max_readahead_per_file_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages) { + CERROR("can't set file readahead more than" + "max_read_ahead_mb %lu MB\n", + sbi->ll_ra_info.ra_max_pages); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_pages_per_file = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +static int ll_rd_max_read_ahead_whole_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + long pages_number; + int mult; + + spin_lock(&sbi->ll_lock); + pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; + spin_unlock(&sbi->ll_lock); + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + return lprocfs_read_frac_helper(page, count, pages_number, mult); +} + +static int ll_wr_max_read_ahead_whole_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int mult, rc, pages_number; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + /* Cap this at the current max readahead window size, the readahead + * algorithm does this anyway so it's pointless to set it larger. */ + if (pages_number < 0 || + pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { + CERROR("can't set max_read_ahead_whole_mb more than " + "max_read_ahead_per_file_mb: %lu\n", + sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT)); + return -ERANGE; + } + + spin_lock(&sbi->ll_lock); + sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; + spin_unlock(&sbi->ll_lock); + + return count; +} + +static int ll_rd_max_cached_mb(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int shift = 20 - PAGE_CACHE_SHIFT; + int max_cached_mb; + int unused_mb; + + *eof = 1; + max_cached_mb = cache->ccc_lru_max >> shift; + unused_mb = atomic_read(&cache->ccc_lru_left) >> shift; + return snprintf(page, count, + "users: %d\n" + "max_cached_mb: %d\n" + "used_mb: %d\n" + "unused_mb: %d\n" + "reclaim_count: %u\n", + atomic_read(&cache->ccc_users), + max_cached_mb, + max_cached_mb - unused_mb, + unused_mb, + cache->ccc_lru_shrinkers); +} + +static int ll_wr_max_cached_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int mult, rc, pages_number; + int diff = 0; + int nrpages = 0; + ENTRY; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer = lprocfs_find_named_value(buffer, "max_cached_mb:", &count); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + RETURN(rc); + + if (pages_number < 0 || pages_number > num_physpages) { + CERROR("%s: can't set max cache more than %lu MB\n", + ll_get_fsname(sb, NULL, 0), + num_physpages >> (20 - PAGE_CACHE_SHIFT)); + RETURN(-ERANGE); + } + + if (sbi->ll_dt_exp == NULL) + RETURN(-ENODEV); + + spin_lock(&sbi->ll_lock); + diff = pages_number - cache->ccc_lru_max; + spin_unlock(&sbi->ll_lock); + + /* easy - add more LRU slots. */ + if (diff >= 0) { + atomic_add(diff, &cache->ccc_lru_left); + GOTO(out, rc = 0); + } + + diff = -diff; + while (diff > 0) { + int tmp; + + /* reduce LRU budget from free slots. */ + do { + int ov, nv; + + ov = atomic_read(&cache->ccc_lru_left); + if (ov == 0) + break; + + nv = ov > diff ? ov - diff : 0; + rc = cfs_atomic_cmpxchg(&cache->ccc_lru_left, ov, nv); + if (likely(ov == rc)) { + diff -= ov - nv; + nrpages += ov - nv; + break; + } + } while (1); + + if (diff <= 0) + break; + + /* difficult - have to ask OSCs to drop LRU slots. */ + tmp = diff << 1; + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, + sizeof(KEY_CACHE_LRU_SHRINK), + KEY_CACHE_LRU_SHRINK, + sizeof(tmp), &tmp, NULL); + if (rc < 0) + break; + } + +out: + if (rc >= 0) { + spin_lock(&sbi->ll_lock); + cache->ccc_lru_max = pages_number; + spin_unlock(&sbi->ll_lock); + rc = count; + } else { + atomic_add(nrpages, &cache->ccc_lru_left); + } + return rc; +} + +static int ll_rd_checksum(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return snprintf(page, count, "%u\n", + (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); +} + +static int ll_wr_checksum(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + if (!sbi->ll_dt_exp) + /* Not set up yet */ + return -EAGAIN; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val) + sbi->ll_flags |= LL_SBI_CHECKSUM; + else + sbi->ll_flags &= ~LL_SBI_CHECKSUM; + + rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(val), &val, NULL); + if (rc) + CWARN("Failed to set OSC checksum flags: %d\n", rc); + + return count; +} + +static int ll_rd_max_rw_chunk(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + + return snprintf(page, count, "%lu\n", ll_s2sbi(sb)->ll_max_rw_chunk); +} + +static int ll_wr_max_rw_chunk(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + ll_s2sbi(sb)->ll_max_rw_chunk = val; + return count; +} + +static int ll_rd_track_id(char *page, int count, void *data, + enum stats_track_type type) +{ + struct super_block *sb = data; + + if (ll_s2sbi(sb)->ll_stats_track_type == type) { + return snprintf(page, count, "%d\n", + ll_s2sbi(sb)->ll_stats_track_id); + + } else if (ll_s2sbi(sb)->ll_stats_track_type == STATS_TRACK_ALL) { + return snprintf(page, count, "0 (all)\n"); + } else { + return snprintf(page, count, "untracked\n"); + } +} + +static int ll_wr_track_id(const char *buffer, unsigned long count, void *data, + enum stats_track_type type) +{ + struct super_block *sb = data; + int rc, pid; + + rc = lprocfs_write_helper(buffer, count, &pid); + if (rc) + return rc; + ll_s2sbi(sb)->ll_stats_track_id = pid; + if (pid == 0) + ll_s2sbi(sb)->ll_stats_track_type = STATS_TRACK_ALL; + else + ll_s2sbi(sb)->ll_stats_track_type = type; + lprocfs_clear_stats(ll_s2sbi(sb)->ll_stats); + return count; +} + +static int ll_rd_track_pid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return (ll_rd_track_id(page, count, data, STATS_TRACK_PID)); +} + +static int ll_wr_track_pid(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + return (ll_wr_track_id(buffer, count, data, STATS_TRACK_PID)); +} + +static int ll_rd_track_ppid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return (ll_rd_track_id(page, count, data, STATS_TRACK_PPID)); +} + +static int ll_wr_track_ppid(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + return (ll_wr_track_id(buffer, count, data, STATS_TRACK_PPID)); +} + +static int ll_rd_track_gid(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return (ll_rd_track_id(page, count, data, STATS_TRACK_GID)); +} + +static int ll_wr_track_gid(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + return (ll_wr_track_id(buffer, count, data, STATS_TRACK_GID)); +} + +static int ll_rd_statahead_max(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return snprintf(page, count, "%u\n", sbi->ll_sa_max); +} + +static int ll_wr_statahead_max(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val >= 0 && val <= LL_SA_RPC_MAX) + sbi->ll_sa_max = val; + else + CERROR("Bad statahead_max value %d. Valid values are in the " + "range [0, %d]\n", val, LL_SA_RPC_MAX); + + return count; +} + +static int ll_rd_statahead_agl(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return snprintf(page, count, "%u\n", + sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); +} + +static int ll_wr_statahead_agl(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_AGL_ENABLED; + else + sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; + + return count; +} + +static int ll_rd_statahead_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return snprintf(page, count, + "statahead total: %u\n" + "statahead wrong: %u\n" + "agl total: %u\n", + atomic_read(&sbi->ll_sa_total), + atomic_read(&sbi->ll_sa_wrong), + atomic_read(&sbi->ll_agl_total)); +} + +static int ll_rd_lazystatfs(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return snprintf(page, count, "%u\n", + (sbi->ll_flags & LL_SBI_LAZYSTATFS) ? 1 : 0); +} + +static int ll_wr_lazystatfs(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + sbi->ll_flags |= LL_SBI_LAZYSTATFS; + else + sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; + + return count; +} + +static int ll_rd_maxea_size(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + unsigned int ealen; + int rc; + + rc = ll_get_max_mdsize(sbi, &ealen); + if (rc) + return rc; + + return snprintf(page, count, "%u\n", ealen); +} + +static int ll_rd_sbi_flags(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + const char *str[] = LL_SBI_FLAGS; + struct super_block *sb = data; + int flags = ll_s2sbi(sb)->ll_flags; + int i = 0; + int rc = 0; + + while (flags != 0) { + if (ARRAY_SIZE(str) <= i) { + CERROR("%s: Revise array LL_SBI_FLAGS to match sbi " + "flags please.\n", ll_get_fsname(sb, NULL, 0)); + return -EINVAL; + } + + if (flags & 0x1) + rc += snprintf(page + rc, count - rc, "%s ", str[i]); + flags >>= 1; + ++i; + } + if (rc > 0) + rc += snprintf(page + rc, count - rc, "\b\n"); + return rc; +} + +static int ll_rd_unstable_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct cl_client_cache *cache = &sbi->ll_cache; + int pages, mb, rc; + + pages = atomic_read(&cache->ccc_unstable_nr); + mb = (pages * PAGE_CACHE_SIZE) >> 20; + + rc = snprintf(page, count, "unstable_pages: %8d\n" + "unstable_mb: %8d\n", pages, mb); + + return rc; +} + +static struct lprocfs_vars lprocfs_llite_obd_vars[] = { + { "uuid", ll_rd_sb_uuid, 0, 0 }, + //{ "mntpt_path", ll_rd_path, 0, 0 }, + { "fstype", ll_rd_fstype, 0, 0 }, + { "site", ll_rd_site_stats, 0, 0 }, + { "blocksize", ll_rd_blksize, 0, 0 }, + { "kbytestotal", ll_rd_kbytestotal, 0, 0 }, + { "kbytesfree", ll_rd_kbytesfree, 0, 0 }, + { "kbytesavail", ll_rd_kbytesavail, 0, 0 }, + { "filestotal", ll_rd_filestotal, 0, 0 }, + { "filesfree", ll_rd_filesfree, 0, 0 }, + { "client_type", ll_rd_client_type, 0, 0 }, + //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, + { "max_read_ahead_mb", ll_rd_max_readahead_mb, + ll_wr_max_readahead_mb, 0 }, + { "max_read_ahead_per_file_mb", ll_rd_max_readahead_per_file_mb, + ll_wr_max_readahead_per_file_mb, 0 }, + { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, + ll_wr_max_read_ahead_whole_mb, 0 }, + { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, + { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 }, + { "max_rw_chunk", ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 }, + { "stats_track_pid", ll_rd_track_pid, ll_wr_track_pid, 0 }, + { "stats_track_ppid", ll_rd_track_ppid, ll_wr_track_ppid, 0 }, + { "stats_track_gid", ll_rd_track_gid, ll_wr_track_gid, 0 }, + { "statahead_max", ll_rd_statahead_max, ll_wr_statahead_max, 0 }, + { "statahead_agl", ll_rd_statahead_agl, ll_wr_statahead_agl, 0 }, + { "statahead_stats", ll_rd_statahead_stats, 0, 0 }, + { "lazystatfs", ll_rd_lazystatfs, ll_wr_lazystatfs, 0 }, + { "max_easize", ll_rd_maxea_size, 0, 0 }, + { "sbi_flags", ll_rd_sbi_flags, 0, 0 }, + { "unstable_stats", ll_rd_unstable_stats, 0, 0}, + { 0 } +}; + +#define MAX_STRING_SIZE 128 + +struct llite_file_opcode { + __u32 opcode; + __u32 type; + const char *opname; +} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { + /* file operation */ + { LPROC_LL_DIRTY_HITS, LPROCFS_TYPE_REGS, "dirty_pages_hits" }, + { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, + { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes" }, + { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes" }, + { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_read" }, + { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "brw_write" }, + { LPROC_LL_OSC_READ, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_read" }, + { LPROC_LL_OSC_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "osc_write" }, + { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, + { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, + { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, + { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, + { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, + { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, + { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, + /* inode operation */ + { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, + { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, + { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, + /* dir inode operation */ + { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, + { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, + { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, + { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, + { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, + { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, + { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, + { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, + /* special inode operation */ + { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, + { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, + { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, + { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, + { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, +}; + +void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) +{ + if (!sbi->ll_stats) + return; + if (sbi->ll_stats_track_type == STATS_TRACK_ALL) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PID && + sbi->ll_stats_track_id == current->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && + sbi->ll_stats_track_id == current->parent->pid) + lprocfs_counter_add(sbi->ll_stats, op, count); + else if (sbi->ll_stats_track_type == STATS_TRACK_GID && + sbi->ll_stats_track_id == current_gid()) + lprocfs_counter_add(sbi->ll_stats, op, count); +} +EXPORT_SYMBOL(ll_stats_ops_tally); + +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", +}; + + +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc) +{ + struct lprocfs_vars lvars[2]; + struct lustre_sb_info *lsi = s2lsi(sb); + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + char name[MAX_STRING_SIZE + 1], *ptr; + int err, id, len, rc; + ENTRY; + + memset(lvars, 0, sizeof(lvars)); + + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; + + LASSERT(sbi != NULL); + LASSERT(mdc != NULL); + LASSERT(osc != NULL); + + /* Get fsname */ + len = strlen(lsi->lsi_lmd->lmd_profile); + ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); + if (ptr && (strcmp(ptr, "-client") == 0)) + len -= 7; + + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, + lsi->lsi_lmd->lmd_profile, sb); + + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) { + err = PTR_ERR(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + RETURN(err); + } + + rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, + &vvp_dump_pgcache_file_ops, sbi); + if (rc) + CWARN("Error adding the dump_page_cache file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats", 0644, + &ll_rw_extents_stats_fops, sbi); + if (rc) + CWARN("Error adding the extent_stats file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "extents_stats_per_process", + 0644, &ll_rw_extents_stats_pp_fops, sbi); + if (rc) + CWARN("Error adding the extents_stats_per_process file\n"); + + rc = lprocfs_seq_create(sbi->ll_proc_root, "offset_stats", 0644, + &ll_rw_offset_stats_fops, sbi); + if (rc) + CWARN("Error adding the offset_stats file\n"); + + /* File operations stats */ + sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_stats == NULL) + GOTO(out, err = -ENOMEM); + /* do counter init */ + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { + __u32 type = llite_opcode_table[id].type; + void *ptr = NULL; + if (type & LPROCFS_TYPE_REGS) + ptr = "regs"; + else if (type & LPROCFS_TYPE_BYTES) + ptr = "bytes"; + else if (type & LPROCFS_TYPE_PAGES) + ptr = "pages"; + lprocfs_counter_init(sbi->ll_stats, + llite_opcode_table[id].opcode, + (type & LPROCFS_CNTR_AVGMINMAX), + llite_opcode_table[id].opname, ptr); + } + err = lprocfs_register_stats(sbi->ll_proc_root, "stats", sbi->ll_stats); + if (err) + GOTO(out, err); + + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), + LPROCFS_STATS_FLAG_NONE); + if (sbi->ll_ra_stats == NULL) + GOTO(out, err = -ENOMEM); + + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) + lprocfs_counter_init(sbi->ll_ra_stats, id, 0, + ra_stat_string[id], "pages"); + err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", + sbi->ll_ra_stats); + if (err) + GOTO(out, err); + + + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_llite_obd_vars, sb); + if (err) + GOTO(out, err); + + /* MDC info */ + obd = class_name2obd(mdc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + snprintf(name, MAX_STRING_SIZE, "%s/common_name", + obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_name; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err) + GOTO(out, err); + + snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_uuid; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err) + GOTO(out, err); + + /* OSC */ + obd = class_name2obd(osc); + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_name != NULL); + + snprintf(name, MAX_STRING_SIZE, "%s/common_name", + obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_name; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err) + GOTO(out, err); + + snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_uuid; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); +out: + if (err) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } + RETURN(err); +} + +void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) +{ + if (sbi->ll_proc_root) { + lprocfs_remove(&sbi->ll_proc_root); + lprocfs_free_stats(&sbi->ll_ra_stats); + lprocfs_free_stats(&sbi->ll_stats); + } +} +#undef MAX_STRING_SIZE + +#define pct(a,b) (b ? a * 100 / b : 0) + +static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, + struct seq_file *seq, int which) +{ + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + unsigned long start, end, r, w; + char *unitp = "KMGTPEZY"; + int i, units = 10; + struct per_process_info *pp_info = &io_extents->pp_extents[which]; + + read_cum = 0; + write_cum = 0; + start = 0; + + for(i = 0; i < LL_HIST_MAX; i++) { + read_tot += pp_info->pp_r_hist.oh_buckets[i]; + write_tot += pp_info->pp_w_hist.oh_buckets[i]; + } + + for(i = 0; i < LL_HIST_MAX; i++) { + r = pp_info->pp_r_hist.oh_buckets[i]; + w = pp_info->pp_w_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + end = 1 << (i + LL_HIST_START - units); + seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | " + "%14lu %4lu %4lu\n", start, *unitp, end, *unitp, + (i == LL_HIST_MAX - 1) ? '+' : ' ', + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + start = end; + if (start == 1<<10) { + start = 1; + units += 10; + unitp++; + } + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int k; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, " + "then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_pp_extent_lock); + for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { + if (io_extents->pp_extents[k].pid != 0) { + seq_printf(seq, "\nPID: %d\n", + io_extents->pp_extents[k].pid); + ll_display_extents_info(io_extents, seq, k); + } + } + spin_unlock(&sbi->ll_pp_extent_lock); + return 0; +} + +static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, + const char *buf, size_t len, + loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + rc = lprocfs_write_helper(buf, len, &value); + if (rc < 0 && (strcmp(buf, "disabled") == 0 || + strcmp(buf, "Disabled") == 0)) + value = 0; + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + return len; +} + +LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); + +static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, " + "then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + + seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); + seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", + "extents", "calls", "%", "cum%", + "calls", "%", "cum%"); + spin_lock(&sbi->ll_lock); + ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); + spin_unlock(&sbi->ll_lock); + + return 0; +} + +static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + int i; + int value = 1, rc = 0; + + rc = lprocfs_write_helper(buf, len, &value); + if (rc < 0 && (strcmp(buf, "disabled") == 0 || + strcmp(buf, "Disabled") == 0)) + value = 0; + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + spin_lock(&sbi->ll_pp_extent_lock); + for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { + io_extents->pp_extents[i].pid = 0; + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); + } + spin_unlock(&sbi->ll_pp_extent_lock); + + return len; +} + +LPROC_SEQ_FOPS(ll_rw_extents_stats); + +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) +{ + int i, cur = -1; + struct ll_rw_process_info *process; + struct ll_rw_process_info *offset; + int *off_count = &sbi->ll_rw_offset_entry_count; + int *process_count = &sbi->ll_offset_process_count; + struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; + + if(!sbi->ll_rw_stats_on) + return; + process = sbi->ll_rw_process_info; + offset = sbi->ll_rw_offset_info; + + spin_lock(&sbi->ll_pp_extent_lock); + /* Extent statistics */ + for(i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if(io_extents->pp_extents[i].pid == pid) { + cur = i; + break; + } + } + + if (cur == -1) { + /* new process */ + sbi->ll_extent_process_count = + (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; + cur = sbi->ll_extent_process_count; + io_extents->pp_extents[cur].pid = pid; + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); + lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); + } + + for(i = 0; (count >= (1 << LL_HIST_START << i)) && + (i < (LL_HIST_MAX - 1)); i++); + if (rw == 0) { + io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; + } else { + io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; + io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; + } + spin_unlock(&sbi->ll_pp_extent_lock); + + spin_lock(&sbi->ll_process_lock); + /* Offset statistics */ + for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid == pid) { + if (process[i].rw_last_file != file) { + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = 0; + process[i].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); + return; + } + if (process[i].rw_last_file_pos != pos) { + *off_count = + (*off_count + 1) % LL_OFFSET_HIST_MAX; + offset[*off_count].rw_op = process[i].rw_op; + offset[*off_count].rw_pid = pid; + offset[*off_count].rw_range_start = + process[i].rw_range_start; + offset[*off_count].rw_range_end = + process[i].rw_last_file_pos; + offset[*off_count].rw_smallest_extent = + process[i].rw_smallest_extent; + offset[*off_count].rw_largest_extent = + process[i].rw_largest_extent; + offset[*off_count].rw_offset = + process[i].rw_offset; + process[i].rw_op = rw; + process[i].rw_range_start = pos; + process[i].rw_smallest_extent = count; + process[i].rw_largest_extent = count; + process[i].rw_offset = pos - + process[i].rw_last_file_pos; + } + if(process[i].rw_smallest_extent > count) + process[i].rw_smallest_extent = count; + if(process[i].rw_largest_extent < count) + process[i].rw_largest_extent = count; + process[i].rw_last_file_pos = pos + count; + spin_unlock(&sbi->ll_process_lock); + return; + } + } + *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; + process[*process_count].rw_pid = pid; + process[*process_count].rw_op = rw; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; + process[*process_count].rw_smallest_extent = count; + process[*process_count].rw_largest_extent = count; + process[*process_count].rw_offset = 0; + process[*process_count].rw_last_file = file; + spin_unlock(&sbi->ll_process_lock); +} + +static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; + struct ll_rw_process_info *process = sbi->ll_rw_process_info; + int i; + + do_gettimeofday(&now); + + if (!sbi->ll_rw_stats_on) { + seq_printf(seq, "disabled\n" + "write anything in this file to activate, " + "then 0 or \"[D/d]isabled\" to deactivate\n"); + return 0; + } + spin_lock(&sbi->ll_process_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", + "R/W", "PID", "RANGE START", "RANGE END", + "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); + /* We stored the discontiguous offsets here; print them first */ + for(i = 0; i < LL_OFFSET_HIST_MAX; i++) { + if (offset[i].rw_pid != 0) + seq_printf(seq,"%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + offset[i].rw_op ? 'W' : 'R', + offset[i].rw_pid, + offset[i].rw_range_start, + offset[i].rw_range_end, + (unsigned long)offset[i].rw_smallest_extent, + (unsigned long)offset[i].rw_largest_extent, + offset[i].rw_offset); + } + /* Then print the current offsets for each process */ + for(i = 0; i < LL_PROCESS_HIST_MAX; i++) { + if (process[i].rw_pid != 0) + seq_printf(seq,"%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", + process[i].rw_op ? 'W' : 'R', + process[i].rw_pid, + process[i].rw_range_start, + process[i].rw_last_file_pos, + (unsigned long)process[i].rw_smallest_extent, + (unsigned long)process[i].rw_largest_extent, + process[i].rw_offset); + } + spin_unlock(&sbi->ll_process_lock); + + return 0; +} + +static ssize_t ll_rw_offset_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct ll_sb_info *sbi = seq->private; + struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; + struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; + int value = 1, rc = 0; + + rc = lprocfs_write_helper(buf, len, &value); + + if (rc < 0 && (strcmp(buf, "disabled") == 0 || + strcmp(buf, "Disabled") == 0)) + value = 0; + + if (value == 0) + sbi->ll_rw_stats_on = 0; + else + sbi->ll_rw_stats_on = 1; + + spin_lock(&sbi->ll_process_lock); + sbi->ll_offset_process_count = 0; + sbi->ll_rw_offset_entry_count = 0; + memset(process_info, 0, sizeof(struct ll_rw_process_info) * + LL_PROCESS_HIST_MAX); + memset(offset_info, 0, sizeof(struct ll_rw_process_info) * + LL_OFFSET_HIST_MAX); + spin_unlock(&sbi->ll_process_lock); + + return len; +} + +LPROC_SEQ_FOPS(ll_rw_offset_stats); + +void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = NULL; + lvars->obd_vars = lprocfs_llite_obd_vars; +} +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c new file mode 100644 index 000000000000..e6b3f54abbe3 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/namei.c @@ -0,0 +1,1279 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include "llite_internal.h" + +static int ll_create_it(struct inode *, struct dentry *, + int, struct lookup_intent *); + +/* + * Check if we have something mounted at the named dchild. + * In such a case there would always be dentry present. + */ +static int ll_d_mountpoint(struct dentry *dparent, struct dentry *dchild, + struct qstr *name) +{ + int mounted = 0; + + if (unlikely(dchild)) { + mounted = d_mountpoint(dchild); + } else if (dparent) { + dchild = d_lookup(dparent, name); + if (dchild) { + mounted = d_mountpoint(dchild); + dput(dchild); + } + } + return mounted; +} + +int ll_unlock(__u32 mode, struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + + +/* called from iget5_locked->find_inode() under inode_lock spinlock */ +static int ll_test_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_md *md = opaque; + + if (unlikely(!(md->body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } + + if (!lu_fid_eq(&lli->lli_fid, &md->body->fid1)) + return 0; + + return 1; +} + +static int ll_set_inode(struct inode *inode, void *opaque) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct mdt_body *body = ((struct lustre_md *)opaque)->body; + + if (unlikely(!(body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return -EINVAL; + } + + lli->lli_fid = body->fid1; + if (unlikely(!(body->valid & OBD_MD_FLTYPE))) { + CERROR("Can not initialize inode "DFID" without object type: " + "valid = "LPX64"\n", PFID(&lli->lli_fid), body->valid); + return -EINVAL; + } + + inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mode & S_IFMT); + if (unlikely(inode->i_mode == 0)) { + CERROR("Invalid inode "DFID" type\n", PFID(&lli->lli_fid)); + return -EINVAL; + } + + ll_lli_init(lli); + + return 0; +} + + +/* + * Get an inode by inode number (already instantiated by the intent lookup). + * Returns inode or NULL + */ +struct inode *ll_iget(struct super_block *sb, ino_t hash, + struct lustre_md *md) +{ + struct inode *inode; + ENTRY; + + LASSERT(hash != 0); + inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + + if (inode) { + if (inode->i_state & I_NEW) { + int rc = 0; + + ll_read_inode2(inode, md); + if (S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_clob == NULL) { + CDEBUG(D_INODE, + "%s: apply lsm %p to inode "DFID".\n", + ll_get_fsname(sb, NULL, 0), md->lsm, + PFID(ll_inode2fid(inode))); + rc = cl_file_inode_init(inode, md); + } + if (rc != 0) { + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else + unlock_new_inode(inode); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) + ll_update_inode(inode, md); + CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n", + inode, PFID(&md->body->fid1)); + } + RETURN(inode); +} + +static void ll_invalidate_negative_children(struct inode *dir) +{ + struct dentry *dentry, *tmp_subdir; + struct ll_d_hlist_node *p; + + ll_lock_dcache(dir); + ll_d_hlist_for_each_entry(dentry, p, &dir->i_dentry, d_alias) { + spin_lock(&dentry->d_lock); + if (!list_empty(&dentry->d_subdirs)) { + struct dentry *child; + + list_for_each_entry_safe(child, tmp_subdir, + &dentry->d_subdirs, + d_u.d_child) { + if (child->d_inode == NULL) + d_lustre_invalidate(child); + } + } + spin_unlock(&dentry->d_lock); + } + ll_unlock_dcache(dir); +} + +int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: { + struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; + __u64 bits = lock->l_policy_data.l_inodebits.bits; + struct lu_fid *fid; + ldlm_mode_t mode = lock->l_req_mode; + + /* Inode is set to lock->l_resource->lr_lvb_inode + * for mdc - bug 24555 */ + LASSERT(lock->l_ast_data == NULL); + + /* Invalidate all dentries associated with this inode */ + if (inode == NULL) + break; + + LASSERT(lock->l_flags & LDLM_FL_CANCELING); + /* For OPEN locks we differentiate between lock modes + * LCK_CR, LCK_CW, LCK_PR - bug 22891 */ + if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | + MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) + ll_have_md_lock(inode, &bits, LCK_MINMODE); + + if (bits & MDS_INODELOCK_OPEN) + ll_have_md_lock(inode, &bits, mode); + + fid = ll_inode2fid(inode); + if (lock->l_resource->lr_name.name[0] != fid_seq(fid) || + lock->l_resource->lr_name.name[1] != fid_oid(fid) || + lock->l_resource->lr_name.name[2] != fid_ver(fid)) { + LDLM_ERROR(lock, "data mismatch with object " + DFID" (%p)", PFID(fid), inode); + } + + if (bits & MDS_INODELOCK_OPEN) { + int flags = 0; + switch (lock->l_req_mode) { + case LCK_CW: + flags = FMODE_WRITE; + break; + case LCK_PR: + flags = FMODE_EXEC; + break; + case LCK_CR: + flags = FMODE_READ; + break; + default: + CERROR("Unexpected lock mode for OPEN lock " + "%d, inode %ld\n", lock->l_req_mode, + inode->i_ino); + } + ll_md_real_close(inode, flags); + } + + lli = ll_i2info(inode); + if (bits & MDS_INODELOCK_LAYOUT) { + struct cl_object_conf conf = { { 0 } }; + + conf.coc_opc = OBJECT_CONF_INVALIDATE; + conf.coc_inode = inode; + rc = ll_layout_conf(inode, &conf); + if (rc) + CDEBUG(D_INODE, "invaliding layout %d.\n", rc); + } + + if (bits & MDS_INODELOCK_UPDATE) + lli->lli_flags &= ~LLIF_MDS_SIZE_LOCK; + + if (S_ISDIR(inode->i_mode) && + (bits & MDS_INODELOCK_UPDATE)) { + CDEBUG(D_INODE, "invalidating inode %lu\n", + inode->i_ino); + truncate_inode_pages(inode->i_mapping, 0); + ll_invalidate_negative_children(inode); + } + + if (inode->i_sb->s_root && + inode != inode->i_sb->s_root->d_inode && + (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM))) + ll_invalidate_aliases(inode); + iput(inode); + break; + } + default: + LBUG(); + } + + RETURN(0); +} + +__u32 ll_i2suppgid(struct inode *i) +{ + if (current_is_in_group(i->i_gid)) + return (__u32)i->i_gid; + else + return (__u32)(-1); +} + +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ +#if 0 + int i; +#endif + + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + suppgids[0] = ll_i2suppgid(i1); + + if (i2) + suppgids[1] = ll_i2suppgid(i2); + else + suppgids[1] = -1; + +#if 0 + for (i = 0; i < current_ngroups; i++) { + if (suppgids[0] == -1) { + if (current_groups[i] != suppgids[1]) + suppgids[0] = current_groups[i]; + continue; + } + if (suppgids[1] == -1) { + if (current_groups[i] != suppgids[0]) + suppgids[1] = current_groups[i]; + continue; + } + break; + } +#endif +} + +/* + * try to reuse three types of dentry: + * 1. unhashed alias, this one is unhashed by d_invalidate (but it may be valid + * by concurrent .revalidate). + * 2. INVALID alias (common case for no valid ldlm lock held, but this flag may + * be cleared by others calling d_lustre_revalidate). + * 3. DISCONNECTED alias. + */ +static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) +{ + struct dentry *alias, *discon_alias, *invalid_alias; + struct ll_d_hlist_node *p; + + if (ll_d_hlist_empty(&inode->i_dentry)) + return NULL; + + discon_alias = invalid_alias = NULL; + + ll_lock_dcache(inode); + ll_d_hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) { + LASSERT(alias != dentry); + + spin_lock(&alias->d_lock); + if (alias->d_flags & DCACHE_DISCONNECTED) + /* LASSERT(last_discon == NULL); LU-405, bz 20055 */ + discon_alias = alias; + else if (alias->d_parent == dentry->d_parent && + alias->d_name.hash == dentry->d_name.hash && + alias->d_name.len == dentry->d_name.len && + memcmp(alias->d_name.name, dentry->d_name.name, + dentry->d_name.len) == 0) + invalid_alias = alias; + spin_unlock(&alias->d_lock); + + if (invalid_alias) + break; + } + alias = invalid_alias ?: discon_alias ?: NULL; + if (alias) { + spin_lock(&alias->d_lock); + dget_dlock(alias); + spin_unlock(&alias->d_lock); + } + ll_unlock_dcache(inode); + + return alias; +} + +/* + * Similar to d_splice_alias(), but lustre treats invalid alias + * similar to DCACHE_DISCONNECTED, and tries to use it anyway. + */ +struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) +{ + struct dentry *new; + + if (inode) { + new = ll_find_alias(inode, de); + if (new) { + ll_dops_init(new, 1, 1); + d_move(new, de); + iput(inode); + CDEBUG(D_DENTRY, + "Reuse dentry %p inode %p refc %d flags %#x\n", + new, new->d_inode, d_refcount(new), new->d_flags); + return new; + } + } + ll_dops_init(de, 1, 1); + __d_lustre_invalidate(de); + d_add(de, inode); + CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", + de, de->d_inode, d_refcount(de), de->d_flags); + return de; +} + +int ll_lookup_it_finish(struct ptlrpc_request *request, + struct lookup_intent *it, void *data) +{ + struct it_cb_data *icbd = data; + struct dentry **de = icbd->icbd_childp; + struct inode *parent = icbd->icbd_parent; + struct inode *inode = NULL; + __u64 bits = 0; + int rc; + ENTRY; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, + it->d.lustre.it_disposition); + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); + if (rc) + RETURN(rc); + + ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); + + /* We used to query real size from OSTs here, but actually + this is not needed. For stat() calls size would be updated + from subsequent do_revalidate()->ll_inode_revalidate_it() in + 2.4 and + vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 + Everybody else who needs correct file size would call + ll_glimpse_size or some equivalent themselves anyway. + Also see bug 7198. */ + } + + /* Only hash *de if it is unhashed (new dentry). + * Atoimc_open may passin hashed dentries for open. + */ + if (d_unhashed(*de)) + *de = ll_splice_alias(inode, *de); + + if (!it_disposition(it, DISP_LOOKUP_NEG)) { + /* we have lookup look - unhide dentry */ + if (bits & MDS_INODELOCK_LOOKUP) + d_lustre_revalidate(*de); + } else if (!it_disposition(it, DISP_OPEN_CREATE)) { + /* If file created on server, don't depend on parent UPDATE + * lock to unhide it. It is left hidden and next lookup can + * find it in ll_splice_alias. + */ + /* Check that parent has UPDATE lock. */ + struct lookup_intent parent_it = { + .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + + if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, + &ll_i2info(parent)->lli_fid, NULL)) { + d_lustre_revalidate(*de); + ll_intent_release(&parent_it); + } + } + + RETURN(0); +} + +static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, + struct lookup_intent *it, int lookup_flags) +{ + struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; + struct dentry *save = dentry, *retval; + struct ptlrpc_request *req = NULL; + struct md_op_data *op_data; + struct it_cb_data icbd; + __u32 opc; + int rc; + ENTRY; + + if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) + RETURN(ERR_PTR(-ENAMETOOLONG)); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.len, dentry->d_name.name, parent->i_ino, + parent->i_generation, parent, LL_IT2STR(it)); + + if (d_mountpoint(dentry)) + CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); + + ll_frob_intent(&it, &lookup_it); + + /* As do_lookup is called before follow_mount, root dentry may be left + * not valid, revalidate it here. */ + if (parent->i_sb->s_root && (parent->i_sb->s_root->d_inode == parent) && + (it->it_op & (IT_OPEN | IT_CREAT))) { + rc = ll_inode_revalidate_it(parent->i_sb->s_root, it, + MDS_INODELOCK_LOOKUP); + if (rc) + RETURN(ERR_PTR(rc)); + } + + if (it->it_op == IT_GETATTR) { + rc = ll_statahead_enter(parent, &dentry, 0); + if (rc == 1) { + if (dentry == save) + GOTO(out, retval = NULL); + GOTO(out, retval = dentry); + } + } + + icbd.icbd_childp = &dentry; + icbd.icbd_parent = parent; + + if (it->it_op & IT_CREAT || + (it->it_op & IT_OPEN && it->it_create_mode & O_CREAT)) + opc = LUSTRE_OPC_CREATE; + else + opc = LUSTRE_OPC_ANY; + + op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, + dentry->d_name.len, lookup_flags, opc, + NULL); + if (IS_ERR(op_data)) + RETURN((void *)op_data); + + /* enforce umask if acl disabled or MDS doesn't support umask */ + if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) + it->it_create_mode &= ~current_umask(); + + rc = md_intent_lock(ll_i2mdexp(parent), op_data, NULL, 0, it, + lookup_flags, &req, ll_md_blocking_ast, 0); + ll_finish_md_op_data(op_data); + if (rc < 0) + GOTO(out, retval = ERR_PTR(rc)); + + rc = ll_lookup_it_finish(req, it, &icbd); + if (rc != 0) { + ll_intent_release(it); + GOTO(out, retval = ERR_PTR(rc)); + } + + if ((it->it_op & IT_OPEN) && dentry->d_inode && + !S_ISREG(dentry->d_inode->i_mode) && + !S_ISDIR(dentry->d_inode->i_mode)) { + ll_release_openhandle(dentry, it); + } + ll_lookup_finish_locks(it, dentry); + + if (dentry == save) + GOTO(out, retval = NULL); + else + GOTO(out, retval = dentry); + out: + if (req) + ptlrpc_req_finished(req); + if (it->it_op == IT_GETATTR && (retval == NULL || retval == dentry)) + ll_statahead_mark(parent, dentry); + return retval; +} + +static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, + unsigned int flags) +{ + struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; + struct dentry *de; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),flags=%u\n", + dentry->d_name.len, dentry->d_name.name, parent->i_ino, + parent->i_generation, parent, flags); + + /* Optimize away (CREATE && !OPEN). Let .create handle the race. */ + if ((flags & LOOKUP_CREATE ) && !(flags & LOOKUP_OPEN)) { + ll_dops_init(dentry, 1, 1); + __d_lustre_invalidate(dentry); + d_add(dentry, NULL); + return NULL; + } + + if (flags & (LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE)) + itp = NULL; + else + itp = ⁢ + de = ll_lookup_it(parent, dentry, itp, 0); + + if (itp != NULL) + ll_intent_release(itp); + + return de; +} + +/* + * For cached negative dentry and new dentry, handle lookup/create/open + * together. + */ +static int ll_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode, int *opened) +{ + struct lookup_intent *it; + struct dentry *de; + long long lookup_flags = LOOKUP_OPEN; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),file %p," + "open_flags %x,mode %x opened %d\n", + dentry->d_name.len, dentry->d_name.name, dir->i_ino, + dir->i_generation, dir, file, open_flags, mode, *opened); + + OBD_ALLOC(it, sizeof(*it)); + if (!it) + RETURN(-ENOMEM); + + it->it_op = IT_OPEN; + if (mode) { + it->it_op |= IT_CREAT; + lookup_flags |= LOOKUP_CREATE; + } + it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; + it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); + + /* Dentry added to dcache tree in ll_lookup_it */ + de = ll_lookup_it(dir, dentry, it, lookup_flags); + if (IS_ERR(de)) + rc = PTR_ERR(de); + else if (de != NULL) + dentry = de; + + if (!rc) { + if (it_disposition(it, DISP_OPEN_CREATE)) { + /* Dentry instantiated in ll_create_it. */ + rc = ll_create_it(dir, dentry, mode, it); + if (rc) { + /* We dget in ll_splice_alias. */ + if (de != NULL) + dput(de); + goto out_release; + } + + *opened |= FILE_CREATED; + } + if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { + /* Open dentry. */ + if (S_ISFIFO(dentry->d_inode->i_mode)) { + /* We cannot call open here as it would + * deadlock. + */ + if (it_disposition(it, DISP_ENQ_OPEN_REF)) + ptlrpc_req_finished( + (struct ptlrpc_request *) + it->d.lustre.it_data); + rc = finish_no_open(file, de); + } else { + file->private_data = it; + rc = finish_open(file, dentry, NULL, opened); + /* We dget in ll_splice_alias. finish_open takes + * care of dget for fd open. + */ + if (de != NULL) + dput(de); + } + } else { + rc = finish_no_open(file, de); + } + } + +out_release: + ll_intent_release(it); + OBD_FREE(it, sizeof(*it)); + + RETURN(rc); +} + + +/* We depend on "mode" being set with the proper file type/umask by now */ +static struct inode *ll_create_node(struct inode *dir, const char *name, + int namelen, const void *data, int datalen, + int mode, __u64 extra, + struct lookup_intent *it) +{ + struct inode *inode = NULL; + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int rc; + ENTRY; + + LASSERT(it && it->d.lustre.it_disposition); + + LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); + request = it->d.lustre.it_data; + it_clear_disposition(it, DISP_ENQ_CREATE_REF); + rc = ll_prep_inode(&inode, request, dir->i_sb, it); + if (rc) + GOTO(out, inode = ERR_PTR(rc)); + + LASSERT(ll_d_hlist_empty(&inode->i_dentry)); + + /* We asked for a lock on the directory, but were granted a + * lock on the inode. Since we finally have an inode pointer, + * stuff it in the lock. */ + CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n", + inode, inode->i_ino, inode->i_generation); + ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); + EXIT; + out: + ptlrpc_req_finished(request); + return inode; +} + +/* + * By the time this is called, we already have created the directory cache + * entry for the new file, but it is so far negative - it has no inode. + * + * We defer creating the OBD object(s) until open, to keep the intent and + * non-intent code paths similar, and also because we do not have the MDS + * inode number before calling ll_create_node() (which is needed for LOV), + * so we would need to do yet another RPC to the MDS to store the LOV EA + * data on the MDS. If needed, we would pass the PACKED lmm as data and + * lmm_size in datalen (the MDS still has code which will handle that). + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, + struct lookup_intent *it) +{ + struct inode *inode; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.len, dentry->d_name.name, dir->i_ino, + dir->i_generation, dir, LL_IT2STR(it)); + + rc = it_open_error(DISP_OPEN_CREATE, it); + if (rc) + RETURN(rc); + + inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len, + NULL, 0, mode, 0, it); + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + if (filename_is_volatile(dentry->d_name.name, dentry->d_name.len, NULL)) + ll_i2info(inode)->lli_volatile = true; + + d_instantiate(dentry, inode); + RETURN(0); +} + +static void ll_update_times(struct ptlrpc_request *request, + struct inode *inode) +{ + struct mdt_body *body = req_capsule_server_get(&request->rq_pill, + &RMF_MDT_BODY); + + LASSERT(body); + if (body->valid & OBD_MD_FLMTIME && + body->mtime > LTIME_S(inode->i_mtime)) { + CDEBUG(D_INODE, "setting ino %lu mtime from %lu to "LPU64"\n", + inode->i_ino, LTIME_S(inode->i_mtime), body->mtime); + LTIME_S(inode->i_mtime) = body->mtime; + } + if (body->valid & OBD_MD_FLCTIME && + body->ctime > LTIME_S(inode->i_ctime)) + LTIME_S(inode->i_ctime) = body->ctime; +} + +static int ll_new_node(struct inode *dir, struct qstr *name, + const char *tgt, int mode, int rdev, + struct dentry *dchild, __u32 opc) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + struct inode *inode = NULL; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int tgt_len = 0; + int err; + + ENTRY; + if (unlikely(tgt != NULL)) + tgt_len = strlen(tgt) + 1; + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, opc, NULL); + if (IS_ERR(op_data)) + GOTO(err_exit, err = PTR_ERR(op_data)); + + err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, + current_fsuid(), current_fsgid(), + cfs_curproc_cap_pack(), rdev, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(err_exit, err); + + ll_update_times(request, dir); + + if (dchild) { + err = ll_prep_inode(&inode, request, dchild->d_sb, NULL); + if (err) + GOTO(err_exit, err); + + d_instantiate(dchild, inode); + } + EXIT; +err_exit: + ptlrpc_req_finished(request); + + return err; +} + +static int ll_mknod_generic(struct inode *dir, struct qstr *name, int mode, + unsigned rdev, struct dentry *dchild) +{ + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p) mode %o dev %x\n", + name->len, name->name, dir->i_ino, dir->i_generation, dir, + mode, rdev); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + + switch (mode & S_IFMT) { + case 0: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ll_new_node(dir, name, NULL, mode, rdev, dchild, + LUSTRE_OPC_MKNOD); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); + + RETURN(err); +} + +/* + * Plain create. Intent create is handled in atomic_open. + */ +static int ll_create_nd(struct inode *dir, struct dentry *dentry, + umode_t mode, bool want_excl) +{ + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)," + "flags=%u, excl=%d\n", + dentry->d_name.len, dentry->d_name.name, dir->i_ino, + dir->i_generation, dir, mode, want_excl); + + rc = ll_mknod_generic(dir, &dentry->d_name, mode, 0, dentry); + + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s, unhashed %d\n", + dentry->d_name.len, dentry->d_name.name, d_unhashed(dentry)); + + return rc; +} + +static int ll_symlink_generic(struct inode *dir, struct qstr *name, + const char *tgt, struct dentry *dchild) +{ + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p),target=%.*s\n", + name->len, name->name, dir->i_ino, dir->i_generation, + dir, 3000, tgt); + + err = ll_new_node(dir, name, (char *)tgt, S_IFLNK | S_IRWXUGO, + 0, dchild, LUSTRE_OPC_SYMLINK); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); + + RETURN(err); +} + +static int ll_link_generic(struct inode *src, struct inode *dir, + struct qstr *name, struct dentry *dchild) +{ + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int err; + + ENTRY; + CDEBUG(D_VFSTRACE, + "VFS Op: inode=%lu/%u(%p), dir=%lu/%u(%p), target=%.*s\n", + src->i_ino, src->i_generation, src, dir->i_ino, + dir->i_generation, dir, name->len, name->name); + + op_data = ll_prep_md_op_data(NULL, src, dir, name->name, name->len, + 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + err = md_link(sbi->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (err) + GOTO(out, err); + + ll_update_times(request, dir); + ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); + EXIT; +out: + ptlrpc_req_finished(request); + RETURN(err); +} + +static int ll_mkdir_generic(struct inode *dir, struct qstr *name, + int mode, struct dentry *dchild) + +{ + int err; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + name->len, name->name, dir->i_ino, dir->i_generation, dir); + + if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) + mode &= ~current_umask(); + mode = (mode & (S_IRWXUGO|S_ISVTX)) | S_IFDIR; + err = ll_new_node(dir, name, NULL, mode, 0, dchild, LUSTRE_OPC_MKDIR); + + if (!err) + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); + + RETURN(err); +} + +/* Try to find the child dentry by its name. + If found, put the result fid into @fid. */ +static void ll_get_child_fid(struct inode * dir, struct qstr *name, + struct lu_fid *fid) +{ + struct dentry *parent, *child; + + parent = ll_d_hlist_entry(dir->i_dentry, struct dentry, d_alias); + child = d_lookup(parent, name); + if (child) { + if (child->d_inode) + *fid = *ll_inode2fid(child->d_inode); + dput(child); + } +} + +static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + name->len, name->name, dir->i_ino, dir->i_generation, dir); + + if (unlikely(ll_d_mountpoint(dparent, dchild, name))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, name->len, + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + ll_get_child_fid(dir, name, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +/** + * Remove dir entry + **/ +int ll_rmdir_entry(struct inode *dir, char *name, int namelen) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + namelen, name, dir->i_ino, dir->i_generation, dir); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name, strlen(name), + S_IFDIR, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + op_data->op_cli_flags |= CLI_RM_ENTRY; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc == 0) { + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); + } + + ptlrpc_req_finished(request); + RETURN(rc); +} + +int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) +{ + struct mdt_body *body; + struct lov_mds_md *eadata; + struct lov_stripe_md *lsm = NULL; + struct obd_trans_info oti = { 0 }; + struct obdo *oa; + struct obd_capa *oc = NULL; + int rc; + ENTRY; + + /* req is swabbed so this is safe */ + body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + if (!(body->valid & OBD_MD_FLEASIZE)) + RETURN(0); + + if (body->eadatasize == 0) { + CERROR("OBD_MD_FLEASIZE set but eadatasize zero\n"); + GOTO(out, rc = -EPROTO); + } + + /* The MDS sent back the EA because we unlinked the last reference + * to this file. Use this EA to unlink the objects on the OST. + * It's opaque so we don't swab here; we leave it to obd_unpackmd() to + * check it is complete and sensible. */ + eadata = req_capsule_server_sized_get(&request->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(eadata != NULL); + + rc = obd_unpackmd(ll_i2dtexp(dir), &lsm, eadata, body->eadatasize); + if (rc < 0) { + CERROR("obd_unpackmd: %d\n", rc); + GOTO(out, rc); + } + LASSERT(rc >= sizeof(*lsm)); + + OBDO_ALLOC(oa); + if (oa == NULL) + GOTO(out_free_memmd, rc = -ENOMEM); + + oa->o_oi = lsm->lsm_oi; + oa->o_mode = body->mode & S_IFMT; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP; + + if (body->valid & OBD_MD_FLCOOKIE) { + oa->o_valid |= OBD_MD_FLCOOKIE; + oti.oti_logcookies = + req_capsule_server_sized_get(&request->rq_pill, + &RMF_LOGCOOKIES, + sizeof(struct llog_cookie) * + lsm->lsm_stripe_count); + if (oti.oti_logcookies == NULL) { + oa->o_valid &= ~OBD_MD_FLCOOKIE; + body->valid &= ~OBD_MD_FLCOOKIE; + } + } + + if (body->valid & OBD_MD_FLOSSCAPA) { + rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc); + if (rc) + GOTO(out_free_memmd, rc); + } + + rc = obd_destroy(NULL, ll_i2dtexp(dir), oa, lsm, &oti, + ll_i2mdexp(dir), oc); + capa_put(oc); + if (rc) + CERROR("obd destroy objid "DOSTID" error %d\n", + POSTID(&lsm->lsm_oi), rc); +out_free_memmd: + obd_free_memmd(ll_i2dtexp(dir), &lsm); + OBDO_FREE(oa); +out: + return rc; +} + +/* ll_unlink_generic() doesn't update the inode with the new link count. + * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there + * is any lock existing. They will recycle dentries and inodes based upon locks + * too. b=20433 */ +static int ll_unlink_generic(struct inode *dir, struct dentry *dparent, + struct dentry *dchild, struct qstr *name) +{ + struct ptlrpc_request *request = NULL; + struct md_op_data *op_data; + int rc; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", + name->len, name->name, dir->i_ino, dir->i_generation, dir); + + /* + * XXX: unlink bind mountpoint maybe call to here, + * just check it as vfs_unlink does. + */ + if (unlikely(ll_d_mountpoint(dparent, dchild, name))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, dir, NULL, name->name, + name->len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + ll_get_child_fid(dir, name, &op_data->op_fid3); + op_data->op_fid2 = op_data->op_fid3; + rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); + ll_finish_md_op_data(op_data); + if (rc) + GOTO(out, rc); + + ll_update_times(request, dir); + ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); + + rc = ll_objects_destroy(request, dir); + out: + ptlrpc_req_finished(request); + RETURN(rc); +} + +static int ll_rename_generic(struct inode *src, struct dentry *src_dparent, + struct dentry *src_dchild, struct qstr *src_name, + struct inode *tgt, struct dentry *tgt_dparent, + struct dentry *tgt_dchild, struct qstr *tgt_name) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + struct md_op_data *op_data; + int err; + ENTRY; + CDEBUG(D_VFSTRACE,"VFS Op:oldname=%.*s,src_dir=%lu/%u(%p),newname=%.*s," + "tgt_dir=%lu/%u(%p)\n", src_name->len, src_name->name, + src->i_ino, src->i_generation, src, tgt_name->len, + tgt_name->name, tgt->i_ino, tgt->i_generation, tgt); + + if (unlikely(ll_d_mountpoint(src_dparent, src_dchild, src_name) || + ll_d_mountpoint(tgt_dparent, tgt_dchild, tgt_name))) + RETURN(-EBUSY); + + op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + ll_get_child_fid(src, src_name, &op_data->op_fid3); + ll_get_child_fid(tgt, tgt_name, &op_data->op_fid4); + err = md_rename(sbi->ll_md_exp, op_data, + src_name->name, src_name->len, + tgt_name->name, tgt_name->len, &request); + ll_finish_md_op_data(op_data); + if (!err) { + ll_update_times(request, src); + ll_update_times(request, tgt); + ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); + err = ll_objects_destroy(request, src); + } + + ptlrpc_req_finished(request); + + RETURN(err); +} + +static int ll_mknod(struct inode *dir, struct dentry *dchild, ll_umode_t mode, + dev_t rdev) +{ + return ll_mknod_generic(dir, &dchild->d_name, mode, + old_encode_dev(rdev), dchild); +} + +static int ll_unlink(struct inode * dir, struct dentry *dentry) +{ + return ll_unlink_generic(dir, NULL, dentry, &dentry->d_name); +} + +static int ll_mkdir(struct inode *dir, struct dentry *dentry, ll_umode_t mode) +{ + return ll_mkdir_generic(dir, &dentry->d_name, mode, dentry); +} + +static int ll_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ll_rmdir_generic(dir, NULL, dentry, &dentry->d_name); +} + +static int ll_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname) +{ + return ll_symlink_generic(dir, &dentry->d_name, oldname, dentry); +} + +static int ll_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + return ll_link_generic(old_dentry->d_inode, dir, &new_dentry->d_name, + new_dentry); +} + +static int ll_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int err; + err = ll_rename_generic(old_dir, NULL, + old_dentry, &old_dentry->d_name, + new_dir, NULL, new_dentry, + &new_dentry->d_name); + if (!err) { + d_move(old_dentry, new_dentry); + } + return err; +} + +struct inode_operations ll_dir_inode_operations = { + .mknod = ll_mknod, + .atomic_open = ll_atomic_open, + .lookup = ll_lookup_nd, + .create = ll_create_nd, + /* We need all these non-raw things for NFSD, to not patch it. */ + .unlink = ll_unlink, + .mkdir = ll_mkdir, + .rmdir = ll_rmdir, + .symlink = ll_symlink, + .link = ll_link, + .rename = ll_rename, + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; + +struct inode_operations ll_special_inode_operations = { + .setattr = ll_setattr, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .get_acl = ll_get_acl, +}; diff --git a/drivers/staging/lustre/lustre/llite/remote_perm.c b/drivers/staging/lustre/lustre/llite/remote_perm.c new file mode 100644 index 000000000000..68b2dc4a7b62 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/remote_perm.c @@ -0,0 +1,333 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/remote_perm.c + * + * Lustre Permission Cache for Remote Client + * + * Author: Lai Siyao + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +struct kmem_cache *ll_remote_perm_cachep = NULL; +struct kmem_cache *ll_rmtperm_hash_cachep = NULL; + +static inline struct ll_remote_perm *alloc_ll_remote_perm(void) +{ + struct ll_remote_perm *lrp; + + OBD_SLAB_ALLOC_PTR_GFP(lrp, ll_remote_perm_cachep, GFP_KERNEL); + if (lrp) + INIT_HLIST_NODE(&lrp->lrp_list); + return lrp; +} + +static inline void free_ll_remote_perm(struct ll_remote_perm *lrp) +{ + if (!lrp) + return; + + if (!hlist_unhashed(&lrp->lrp_list)) + hlist_del(&lrp->lrp_list); + OBD_SLAB_FREE(lrp, ll_remote_perm_cachep, sizeof(*lrp)); +} + +struct hlist_head *alloc_rmtperm_hash(void) +{ + struct hlist_head *hash; + int i; + + OBD_SLAB_ALLOC_GFP(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash), + GFP_IOFS); + if (!hash) + return NULL; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + INIT_HLIST_HEAD(hash + i); + + return hash; +} + +void free_rmtperm_hash(struct hlist_head *hash) +{ + int i; + struct ll_remote_perm *lrp; + struct hlist_node *next; + + if(!hash) + return; + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) + hlist_for_each_entry_safe(lrp, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + OBD_SLAB_FREE(hash, ll_rmtperm_hash_cachep, + REMOTE_PERM_HASHSIZE * sizeof(*hash)); +} + +static inline int remote_perm_hashfunc(uid_t uid) +{ + return uid & (REMOTE_PERM_HASHSIZE - 1); +} + +/* NB: setxid permission is not checked here, instead it's done on + * MDT when client get remote permission. */ +static int do_check_remote_perm(struct ll_inode_info *lli, int mask) +{ + struct hlist_head *head; + struct ll_remote_perm *lrp; + int found = 0, rc; + ENTRY; + + if (!lli->lli_remote_perms) + RETURN(-ENOENT); + + head = lli->lli_remote_perms + remote_perm_hashfunc(current_uid()); + + spin_lock(&lli->lli_lock); + hlist_for_each_entry(lrp, head, lrp_list) { + if (lrp->lrp_uid != current_uid()) + continue; + if (lrp->lrp_gid != current_gid()) + continue; + if (lrp->lrp_fsuid != current_fsuid()) + continue; + if (lrp->lrp_fsgid != current_fsgid()) + continue; + found = 1; + break; + } + + if (!found) + GOTO(out, rc = -ENOENT); + + CDEBUG(D_SEC, "found remote perm: %u/%u/%u/%u - %#x\n", + lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + rc = ((lrp->lrp_access_perm & mask) == mask) ? 0 : -EACCES; + +out: + spin_unlock(&lli->lli_lock); + return rc; +} + +int ll_update_remote_perm(struct inode *inode, struct mdt_remote_perm *perm) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_remote_perm *lrp = NULL, *tmp = NULL; + struct hlist_head *head, *perm_hash = NULL; + ENTRY; + + LASSERT(ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT); + +#if 0 + if (perm->rp_uid != current->uid || + perm->rp_gid != current->gid || + perm->rp_fsuid != current->fsuid || + perm->rp_fsgid != current->fsgid) { + /* user might setxid in this small period */ + CDEBUG(D_SEC, + "remote perm user %u/%u/%u/%u != current %u/%u/%u/%u\n", + perm->rp_uid, perm->rp_gid, perm->rp_fsuid, + perm->rp_fsgid, current->uid, current->gid, + current->fsuid, current->fsgid); + RETURN(-EAGAIN); + } +#endif + + if (!lli->lli_remote_perms) { + perm_hash = alloc_rmtperm_hash(); + if (perm_hash == NULL) { + CERROR("alloc lli_remote_perms failed!\n"); + RETURN(-ENOMEM); + } + } + + spin_lock(&lli->lli_lock); + + if (!lli->lli_remote_perms) + lli->lli_remote_perms = perm_hash; + else if (perm_hash) + free_rmtperm_hash(perm_hash); + + head = lli->lli_remote_perms + remote_perm_hashfunc(perm->rp_uid); + +again: + hlist_for_each_entry(tmp, head, lrp_list) { + if (tmp->lrp_uid != perm->rp_uid) + continue; + if (tmp->lrp_gid != perm->rp_gid) + continue; + if (tmp->lrp_fsuid != perm->rp_fsuid) + continue; + if (tmp->lrp_fsgid != perm->rp_fsgid) + continue; + if (lrp) + free_ll_remote_perm(lrp); + lrp = tmp; + break; + } + + if (!lrp) { + spin_unlock(&lli->lli_lock); + lrp = alloc_ll_remote_perm(); + if (!lrp) { + CERROR("alloc memory for ll_remote_perm failed!\n"); + RETURN(-ENOMEM); + } + spin_lock(&lli->lli_lock); + goto again; + } + + lrp->lrp_access_perm = perm->rp_access_perm; + if (lrp != tmp) { + lrp->lrp_uid = perm->rp_uid; + lrp->lrp_gid = perm->rp_gid; + lrp->lrp_fsuid = perm->rp_fsuid; + lrp->lrp_fsgid = perm->rp_fsgid; + hlist_add_head(&lrp->lrp_list, head); + } + lli->lli_rmtperm_time = cfs_time_current(); + spin_unlock(&lli->lli_lock); + + CDEBUG(D_SEC, "new remote perm@%p: %u/%u/%u/%u - %#x\n", + lrp, lrp->lrp_uid, lrp->lrp_gid, lrp->lrp_fsuid, lrp->lrp_fsgid, + lrp->lrp_access_perm); + + RETURN(0); +} + +int lustre_check_remote_perm(struct inode *inode, int mask) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_remote_perm *perm; + struct obd_capa *oc; + cfs_time_t save; + int i = 0, rc; + ENTRY; + + do { + save = lli->lli_rmtperm_time; + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) + break; + + might_sleep(); + + mutex_lock(&lli->lli_rmtperm_mutex); + /* check again */ + if (save != lli->lli_rmtperm_time) { + rc = do_check_remote_perm(lli, mask); + if (!rc || (rc != -ENOENT && i)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + } + + if (i++ > 5) { + CERROR("check remote perm falls in dead loop!\n"); + LBUG(); + } + + oc = ll_mdscapa_get(inode); + rc = md_get_remote_perm(sbi->ll_md_exp, ll_inode2fid(inode), oc, + ll_i2suppgid(inode), &req); + capa_put(oc); + if (rc) { + mutex_unlock(&lli->lli_rmtperm_mutex); + break; + } + + perm = req_capsule_server_swab_get(&req->rq_pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (unlikely(perm == NULL)) { + mutex_unlock(&lli->lli_rmtperm_mutex); + rc = -EPROTO; + break; + } + + rc = ll_update_remote_perm(inode, perm); + mutex_unlock(&lli->lli_rmtperm_mutex); + if (rc == -ENOMEM) + break; + + ptlrpc_req_finished(req); + req = NULL; + } while (1); + ptlrpc_req_finished(req); + RETURN(rc); +} + +#if 0 /* NB: remote perms can't be freed in ll_mdc_blocking_ast of UPDATE lock, + * because it will fail sanity test 48. + */ +void ll_free_remote_perms(struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct hlist_head *hash = lli->lli_remote_perms; + struct ll_remote_perm *lrp; + struct hlist_node *node, *next; + int i; + + LASSERT(hash); + + spin_lock(&lli->lli_lock); + + for (i = 0; i < REMOTE_PERM_HASHSIZE; i++) { + hlist_for_each_entry_safe(lrp, node, next, hash + i, + lrp_list) + free_ll_remote_perm(lrp); + } + + spin_unlock(&lli->lli_lock); +} +#endif diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c new file mode 100644 index 000000000000..0a0ac262eaaa --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -0,0 +1,1307 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/llite/rw.c + * + * Lustre Lite I/O page cache routines shared by different kernel revs + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +/* current_is_kswapd() */ +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include "llite_internal.h" +#include + +/** + * Finalizes cl-data before exiting typical address_space operation. Dual to + * ll_cl_init(). + */ +static void ll_cl_fini(struct ll_cl_context *lcc) +{ + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(lcc->lcc_cookie == current); + LASSERT(env != NULL); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + + if (io && lcc->lcc_created) { + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_iter_fini(env, io); + cl_io_fini(env, io); + } + cl_env_put(env, &lcc->lcc_refcheck); +} + +/** + * Initializes common cl-data at the typical address_space operation entry + * point. + */ +static struct ll_cl_context *ll_cl_init(struct file *file, + struct page *vmpage, int create) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clob; + struct ccc_io *cio; + + int refcheck; + int result = 0; + + clob = ll_i2info(vmpage->mapping->host)->lli_clob; + LASSERT(clob != NULL); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return ERR_PTR(PTR_ERR(env)); + + lcc = &vvp_env_info(env)->vti_io_ctx; + memset(lcc, 0, sizeof(*lcc)); + lcc->lcc_env = env; + lcc->lcc_refcheck = refcheck; + lcc->lcc_cookie = current; + + cio = ccc_env_io(env); + io = cio->cui_cl.cis_io; + if (io == NULL && create) { + struct inode *inode = vmpage->mapping->host; + loff_t pos; + + if (mutex_trylock(&inode->i_mutex)) { + mutex_unlock(&(inode)->i_mutex); + + /* this is too bad. Someone is trying to write the + * page w/o holding inode mutex. This means we can + * add dirty pages into cache during truncate */ + CERROR("Proc %s is dirting page w/o inode lock, this" + "will break truncate.\n", current->comm); + libcfs_debug_dumpstack(NULL); + LBUG(); + return ERR_PTR(-EIO); + } + + /* + * Loop-back driver calls ->prepare_write() and ->sendfile() + * methods directly, bypassing file system ->write() operation, + * so cl_io has to be created here. + */ + io = ccc_env_thread_io(env); + ll_io_init(io, file, 1); + + /* No lock at all for this kind of IO - we can't do it because + * we have held page lock, it would cause deadlock. + * XXX: This causes poor performance to loop device - One page + * per RPC. + * In order to get better performance, users should use + * lloop driver instead. + */ + io->ci_lockreq = CILR_NEVER; + + pos = (vmpage->index << PAGE_CACHE_SHIFT); + + /* Create a temp IO to serve write. */ + result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE); + if (result == 0) { + cio->cui_fd = LUSTRE_FPRIVATE(file); + cio->cui_iov = NULL; + cio->cui_nrsegs = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + result = cl_io_lock(env, io); + if (result == 0) + result = cl_io_start(env, io); + } + } else + result = io->ci_result; + lcc->lcc_created = 1; + } + + lcc->lcc_io = io; + if (io == NULL) + result = -EIO; + if (result == 0) { + struct cl_page *page; + + LASSERT(io != NULL); + LASSERT(io->ci_state == CIS_IO_GOING); + LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file)); + page = cl_page_find(env, clob, vmpage->index, vmpage, + CPT_CACHEABLE); + if (!IS_ERR(page)) { + lcc->lcc_page = page; + lu_ref_add(&page->cp_reference, "cl_io", io); + result = 0; + } else + result = PTR_ERR(page); + } + if (result) { + ll_cl_fini(lcc); + lcc = ERR_PTR(result); + } + + CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %d %p %p\n", + vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result, + env, io); + return lcc; +} + +static struct ll_cl_context *ll_cl_get(void) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + lcc = &vvp_env_info(env)->vti_io_ctx; + LASSERT(env == lcc->lcc_env); + LASSERT(current == lcc->lcc_cookie); + cl_env_put(env, &refcheck); + + /* env has got in ll_cl_init, so it is still usable. */ + return lcc; +} + +/** + * ->prepare_write() address space operation called by generic_file_write() + * for every page during write. + */ +int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + int result; + ENTRY; + + lcc = ll_cl_init(file, vmpage, 1); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + cl_page_assume(env, io, page); + + result = cl_io_prepare_write(env, io, page, from, to); + if (result == 0) { + /* + * Add a reference, so that page is not evicted from + * the cache until ->commit_write() is called. + */ + cl_page_get(page); + lu_ref_add(&page->cp_reference, "prepare_write", + current); + } else { + cl_page_unassume(env, io, page); + ll_cl_fini(lcc); + } + /* returning 0 in prepare assumes commit must be called + * afterwards */ + } else { + result = PTR_ERR(lcc); + } + RETURN(result); +} + +int ll_commit_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) +{ + struct ll_cl_context *lcc; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result = 0; + ENTRY; + + lcc = ll_cl_get(); + env = lcc->lcc_env; + page = lcc->lcc_page; + io = lcc->lcc_io; + + LASSERT(cl_page_is_owned(page, io)); + LASSERT(from <= to); + if (from != to) /* handle short write case. */ + result = cl_io_commit_write(env, io, page, from, to); + if (cl_page_is_owned(page, io)) + cl_page_unassume(env, io, page); + + /* + * Release reference acquired by ll_prepare_write(). + */ + lu_ref_del(&page->cp_reference, "prepare_write", current); + cl_page_put(env, page); + ll_cl_fini(lcc); + RETURN(result); +} + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt) +{ + __u64 opc; + + opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; + return ll_osscapa_get(inode, opc); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); + +/** + * Get readahead pages from the filesystem readahead pool of the client for a + * thread. + * + * /param sbi superblock for filesystem readahead state ll_ra_info + * /param ria per-thread readahead state + * /param pages number of pages requested for readahead for the thread. + * + * WARNING: This algorithm is used to reduce contention on sbi->ll_lock. + * It should work well if the ra_max_pages is much greater than the single + * file's read-ahead window, and not too many threads contending for + * these readahead pages. + * + * TODO: There may be a 'global sync problem' if many threads are trying + * to get an ra budget that is larger than the remaining readahead pages + * and reach here at exactly the same time. They will compute /a ret to + * consume the remaining pages, but will fail at atomic_add_return() and + * get a zero ra window, although there is still ra space remaining. - Jay */ + +static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, + struct ra_io_arg *ria, + unsigned long pages) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + long ret; + ENTRY; + + /* If read-ahead pages left are less than 1M, do not do read-ahead, + * otherwise it will form small read RPC(< 1M), which hurt server + * performance a lot. */ + ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages); + if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) + GOTO(out, ret = 0); + + /* If the non-strided (ria_pages == 0) readahead window + * (ria_start + ret) has grown across an RPC boundary, then trim + * readahead size by the amount beyond the RPC so it ends on an + * RPC boundary. If the readahead window is already ending on + * an RPC boundary (beyond_rpc == 0), or smaller than a full + * RPC (beyond_rpc < ret) the readahead size is unchanged. + * The (beyond_rpc != 0) check is skipped since the conditional + * branch is more expensive than subtracting zero from the result. + * + * Strided read is left unaligned to avoid small fragments beyond + * the RPC boundary from needing an extra read RPC. */ + if (ria->ria_pages == 0) { + long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES; + if (/* beyond_rpc != 0 && */ beyond_rpc < ret) + ret -= beyond_rpc; + } + + if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { + atomic_sub(ret, &ra->ra_cur_pages); + ret = 0; + } + +out: + RETURN(ret); +} + +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + atomic_sub(len, &ra->ra_cur_pages); +} + +static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) +{ + LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which); + lprocfs_counter_incr(sbi->ll_ra_stats, which); +} + +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) +{ + struct ll_sb_info *sbi = ll_i2sbi(mapping->host); + ll_ra_stats_inc_sbi(sbi, which); +} + +#define RAS_CDEBUG(ras) \ + CDEBUG(D_READA, \ + "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ + "csr %lu sf %lu sp %lu sl %lu \n", \ + ras->ras_last_readpage, ras->ras_consecutive_requests, \ + ras->ras_consecutive_pages, ras->ras_window_start, \ + ras->ras_window_len, ras->ras_next_readahead, \ + ras->ras_requests, ras->ras_request_index, \ + ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ + ras->ras_stride_pages, ras->ras_stride_length) + +static int index_in_window(unsigned long index, unsigned long point, + unsigned long before, unsigned long after) +{ + unsigned long start = point - before, end = point + after; + + if (start > point) + start = 0; + if (end < point) + end = ~0; + + return start <= index && index <= end; +} + +static struct ll_readahead_state *ll_ras_get(struct file *f) +{ + struct ll_file_data *fd; + + fd = LUSTRE_FPRIVATE(f); + return &fd->fd_ras; +} + +void ll_ra_read_in(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + ras->ras_requests++; + ras->ras_request_index = 0; + ras->ras_consecutive_requests++; + rar->lrr_reader = current; + + list_add(&rar->lrr_linkage, &ras->ras_read_beads); + spin_unlock(&ras->ras_lock); +} + +void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar) +{ + struct ll_readahead_state *ras; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + list_del_init(&rar->lrr_linkage); + spin_unlock(&ras->ras_lock); +} + +static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras) +{ + struct ll_ra_read *scan; + + list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) { + if (scan->lrr_reader == current) + return scan; + } + return NULL; +} + +struct ll_ra_read *ll_ra_read_get(struct file *f) +{ + struct ll_readahead_state *ras; + struct ll_ra_read *bead; + + ras = ll_ras_get(f); + + spin_lock(&ras->ras_lock); + bead = ll_ra_read_get_locked(ras); + spin_unlock(&ras->ras_lock); + return bead; +} + +static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_page *page, + struct page *vmpage) +{ + struct ccc_page *cp; + int rc; + + ENTRY; + + rc = 0; + cl_page_assume(env, io, page); + lu_ref_add(&page->cp_reference, "ra", current); + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + if (!cp->cpg_defer_uptodate && !PageUptodate(vmpage)) { + rc = cl_page_is_under_lock(env, io, page); + if (rc == -EBUSY) { + cp->cpg_defer_uptodate = 1; + cp->cpg_ra_used = 0; + cl_page_list_add(queue, page); + rc = 1; + } else { + cl_page_delete(env, page); + rc = -ENOLCK; + } + } else { + /* skip completed pages */ + cl_page_unassume(env, io, page); + } + lu_ref_del(&page->cp_reference, "ra", current); + cl_page_put(env, page); + RETURN(rc); +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was added to \a queue. + * + * \retval -ENOLCK: there is no extent lock for this part of a file, stop + * read-ahead. + * + * \retval -ve, 0: page wasn't added to \a queue for other reason. + */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, + pgoff_t index, struct address_space *mapping) +{ + struct page *vmpage; + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + struct cl_page *page; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + unsigned int gfp_mask; + int rc = 0; + const char *msg = NULL; + + ENTRY; + + gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT; +#ifdef __GFP_NOWARN + gfp_mask |= __GFP_NOWARN; +#endif + vmpage = grab_cache_page_nowait(mapping, index); + if (vmpage != NULL) { + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping == mapping) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + rc = cl_read_ahead_page(env, io, queue, + page, vmpage); + if (rc == -ENOLCK) { + which = RA_STAT_FAILED_MATCH; + msg = "lock match failed"; + } + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + } + } else { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; + } + if (rc != 1) + unlock_page(vmpage); + page_cache_release(vmpage); + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + } + if (msg != NULL) { + ll_ra_stats_inc(mapping, which); + CDEBUG(D_READA, "%s\n", msg); + } + RETURN(rc); +} + +#define RIA_DEBUG(ria) \ + CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ + ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ + ria->ria_pages) + +/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't + * know what the actual RPC size is. If this needs to change, it makes more + * sense to tune the i_blkbits value for the file based on the OSTs it is + * striped over, rather than having a constant value for all files here. */ + +/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)). + * Temprarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled + * by default, this should be adjusted corresponding with max_read_ahead_mb + * and max_read_ahead_per_file_mb otherwise the readahead budget can be used + * up quickly which will affect read performance siginificantly. See LU-2816 */ +#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT) + +static inline int stride_io_mode(struct ll_readahead_state *ras) +{ + return ras->ras_consecutive_stride_requests > 1; +} +/* The function calculates how much pages will be read in + * [off, off + length], in such stride IO area, + * stride_offset = st_off, stride_lengh = st_len, + * stride_pages = st_pgs + * + * |------------------|*****|------------------|*****|------------|*****|.... + * st_off + * |--- st_pgs ---| + * |----- st_len -----| + * + * How many pages it should read in such pattern + * |-------------------------------------------------------------| + * off + * |<------ length ------->| + * + * = |<----->| + |-------------------------------------| + |---| + * start_left st_pgs * i end_left + */ +static unsigned long +stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, + unsigned long off, unsigned long length) +{ + __u64 start = off > st_off ? off - st_off : 0; + __u64 end = off + length > st_off ? off + length - st_off : 0; + unsigned long start_left = 0; + unsigned long end_left = 0; + unsigned long pg_count; + + if (st_len == 0 || length == 0 || end == 0) + return length; + + start_left = do_div(start, st_len); + if (start_left < st_pgs) + start_left = st_pgs - start_left; + else + start_left = 0; + + end_left = do_div(end, st_len); + if (end_left > st_pgs) + end_left = st_pgs; + + CDEBUG(D_READA, "start "LPU64", end "LPU64" start_left %lu end_left %lu \n", + start, end, start_left, end_left); + + if (start == end) + pg_count = end_left - (st_pgs - start_left); + else + pg_count = start_left + st_pgs * (end - start - 1) + end_left; + + CDEBUG(D_READA, "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu" + "pgcount %lu\n", st_off, st_len, st_pgs, off, length, pg_count); + + return pg_count; +} + +static int ria_page_count(struct ra_io_arg *ria) +{ + __u64 length = ria->ria_end >= ria->ria_start ? + ria->ria_end - ria->ria_start + 1 : 0; + + return stride_pg_count(ria->ria_stoff, ria->ria_length, + ria->ria_pages, ria->ria_start, + length); +} + +/*Check whether the index is in the defined ra-window */ +static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) +{ + /* If ria_length == ria_pages, it means non-stride I/O mode, + * idx should always inside read-ahead window in this case + * For stride I/O mode, just check whether the idx is inside + * the ria_pages. */ + return ria->ria_length == 0 || ria->ria_length == ria->ria_pages || + (idx >= ria->ria_stoff && (idx - ria->ria_stoff) % + ria->ria_length < ria->ria_pages); +} + +static int ll_read_ahead_pages(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *queue, + struct ra_io_arg *ria, + unsigned long *reserved_pages, + struct address_space *mapping, + unsigned long *ra_end) +{ + int rc, count = 0, stride_ria; + unsigned long page_idx; + + LASSERT(ria != NULL); + RIA_DEBUG(ria); + + stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0; + for (page_idx = ria->ria_start; page_idx <= ria->ria_end && + *reserved_pages > 0; page_idx++) { + if (ras_inside_ra_window(page_idx, ria)) { + /* If the page is inside the read-ahead window*/ + rc = ll_read_ahead_page(env, io, queue, + page_idx, mapping); + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; + } else if (stride_ria) { + /* If it is not in the read-ahead window, and it is + * read-ahead mode, then check whether it should skip + * the stride gap */ + pgoff_t offset; + /* FIXME: This assertion only is valid when it is for + * forward read-ahead, it will be fixed when backward + * read-ahead is implemented */ + LASSERTF(page_idx > ria->ria_stoff, "Invalid page_idx %lu" + "rs %lu re %lu ro %lu rl %lu rp %lu\n", page_idx, + ria->ria_start, ria->ria_end, ria->ria_stoff, + ria->ria_length, ria->ria_pages); + offset = page_idx - ria->ria_stoff; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; + CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, + ria->ria_length - offset); + continue; + } + } + } + *ra_end = page_idx; + return count; +} + +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags) +{ + struct vvp_io *vio = vvp_env_io(env); + struct vvp_thread_info *vti = vvp_env_info(env); + struct cl_attr *attr = ccc_env_thread_attr(env); + unsigned long start = 0, end = 0, reserved; + unsigned long ra_end, len; + struct inode *inode; + struct ll_ra_read *bead; + struct ra_io_arg *ria = &vti->vti_ria; + struct ll_inode_info *lli; + struct cl_object *clob; + int ret = 0; + __u64 kms; + ENTRY; + + inode = mapping->host; + lli = ll_i2info(inode); + clob = lli->lli_clob; + + memset(ria, 0, sizeof *ria); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); + + if (ret != 0) + RETURN(ret); + kms = attr->cat_kms; + if (kms == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN); + RETURN(0); + } + + spin_lock(&ras->ras_lock); + if (vio->cui_ra_window_set) + bead = &vio->cui_bead; + else + bead = NULL; + + /* Enlarge the RA window to encompass the full read */ + if (bead != NULL && ras->ras_window_start + ras->ras_window_len < + bead->lrr_start + bead->lrr_count) { + ras->ras_window_len = bead->lrr_start + bead->lrr_count - + ras->ras_window_start; + } + /* Reserve a part of the read-ahead window that we'll be issuing */ + if (ras->ras_window_len) { + start = ras->ras_next_readahead; + end = ras->ras_window_start + ras->ras_window_len - 1; + } + if (end != 0) { + unsigned long rpc_boundary; + /* + * Align RA window to an optimal boundary. + * + * XXX This would be better to align to cl_max_pages_per_rpc + * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may + * be aligned to the RAID stripe size in the future and that + * is more important than the RPC size. + */ + /* Note: we only trim the RPC, instead of extending the RPC + * to the boundary, so to avoid reading too much pages during + * random reading. */ + rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1))); + if (rpc_boundary > 0) + rpc_boundary--; + + if (rpc_boundary > start) + end = rpc_boundary; + + /* Truncate RA window to end of file */ + end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT)); + + ras->ras_next_readahead = max(end, end + 1); + RAS_CDEBUG(ras); + } + ria->ria_start = start; + ria->ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; + } + spin_unlock(&ras->ras_lock); + + if (end == 0) { + ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW); + RETURN(0); + } + len = ria_page_count(ria); + if (len == 0) + RETURN(0); + + reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len); + if (reserved < len) + ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); + + CDEBUG(D_READA, "reserved page %lu ra_cur %d ra_max %lu\n", reserved, + atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), + ll_i2sbi(inode)->ll_ra_info.ra_max_pages); + + ret = ll_read_ahead_pages(env, io, queue, + ria, &reserved, mapping, &ra_end); + + LASSERTF(reserved >= 0, "reserved %lu\n", reserved); + if (reserved != 0) + ll_ra_count_put(ll_i2sbi(inode), reserved); + + if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT)) + ll_ra_stats_inc(mapping, RA_STAT_EOF); + + /* if we didn't get to the end of the region we reserved from + * the ras we need to go back and update the ras so that the + * next read-ahead tries from where we left off. we only do so + * if the region we failed to issue read-ahead on is still ahead + * of the app and behind the next index to start read-ahead from */ + CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n", + ra_end, end, ria->ria_end); + + if (ra_end != end + 1) { + spin_lock(&ras->ras_lock); + if (ra_end < ras->ras_next_readahead && + index_in_window(ra_end, ras->ras_window_start, 0, + ras->ras_window_len)) { + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); + } + spin_unlock(&ras->ras_lock); + } + + RETURN(ret); +} + +static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1)); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, + unsigned long index) +{ + ras->ras_last_readpage = index; + ras->ras_consecutive_requests = 0; + ras->ras_consecutive_pages = 0; + ras->ras_window_len = 0; + ras_set_start(inode, ras, index); + ras->ras_next_readahead = max(ras->ras_window_start, index); + + RAS_CDEBUG(ras); +} + +/* called with the ras_lock held or from places where it doesn't matter */ +static void ras_stride_reset(struct ll_readahead_state *ras) +{ + ras->ras_consecutive_stride_requests = 0; + ras->ras_stride_length = 0; + ras->ras_stride_pages = 0; + RAS_CDEBUG(ras); +} + +void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) +{ + spin_lock_init(&ras->ras_lock); + ras_reset(inode, ras, 0); + ras->ras_requests = 0; + INIT_LIST_HEAD(&ras->ras_read_beads); +} + +/* + * Check whether the read request is in the stride window. + * If it is in the stride window, return 1, otherwise return 0. + */ +static int index_in_stride_window(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap; + + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || + ras->ras_stride_pages == ras->ras_stride_length) + return 0; + + stride_gap = index - ras->ras_last_readpage - 1; + + /* If it is contiguous read */ + if (stride_gap == 0) + return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; + + /* Otherwise check the stride by itself */ + return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && + ras->ras_consecutive_pages == ras->ras_stride_pages; +} + +static void ras_update_stride_detector(struct ll_readahead_state *ras, + unsigned long index) +{ + unsigned long stride_gap = index - ras->ras_last_readpage - 1; + + if (!stride_io_mode(ras) && (stride_gap != 0 || + ras->ras_consecutive_stride_requests == 0)) { + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + } + LASSERT(ras->ras_request_index == 0); + LASSERT(ras->ras_consecutive_stride_requests == 0); + + if (index <= ras->ras_last_readpage) { + /*Reset stride window for forward read*/ + ras_stride_reset(ras); + return; + } + + ras->ras_stride_pages = ras->ras_consecutive_pages; + ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; + + RAS_CDEBUG(ras); + return; +} + +static unsigned long +stride_page_count(struct ll_readahead_state *ras, unsigned long len) +{ + return stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, + ras->ras_stride_pages, ras->ras_stride_offset, + len); +} + +/* Stride Read-ahead window will be increased inc_len according to + * stride I/O pattern */ +static void ras_stride_increase_window(struct ll_readahead_state *ras, + struct ll_ra_info *ra, + unsigned long inc_len) +{ + unsigned long left, step, window_len; + unsigned long stride_len; + + LASSERT(ras->ras_stride_length > 0); + LASSERTF(ras->ras_window_start + ras->ras_window_len + >= ras->ras_stride_offset, "window_start %lu, window_len %lu" + " stride_offset %lu\n", ras->ras_window_start, + ras->ras_window_len, ras->ras_stride_offset); + + stride_len = ras->ras_window_start + ras->ras_window_len - + ras->ras_stride_offset; + + left = stride_len % ras->ras_stride_length; + window_len = ras->ras_window_len - left; + + if (left < ras->ras_stride_pages) + left += inc_len; + else + left = ras->ras_stride_pages + inc_len; + + LASSERT(ras->ras_stride_pages != 0); + + step = left / ras->ras_stride_pages; + left %= ras->ras_stride_pages; + + window_len += step * ras->ras_stride_length + left; + + if (stride_page_count(ras, window_len) <= ra->ra_max_pages_per_file) + ras->ras_window_len = window_len; + + RAS_CDEBUG(ras); +} + +static void ras_increase_window(struct inode *inode, + struct ll_readahead_state *ras, + struct ll_ra_info *ra) +{ + /* The stretch of ra-window should be aligned with max rpc_size + * but current clio architecture does not support retrieve such + * information from lower layer. FIXME later + */ + if (stride_io_mode(ras)) + ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode)); + else + ras->ras_window_len = min(ras->ras_window_len + + RAS_INCREASE_STEP(inode), + ra->ra_max_pages_per_file); +} + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit) +{ + struct ll_ra_info *ra = &sbi->ll_ra_info; + int zero = 0, stride_detect = 0, ra_miss = 0; + ENTRY; + + spin_lock(&ras->ras_lock); + + ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); + + /* reset the read-ahead window in two cases. First when the app seeks + * or reads to some other part of the file. Secondly if we get a + * read-ahead miss that we think we've previously issued. This can + * be a symptom of there being so many read-ahead pages that the VM is + * reclaiming it before we get to it. */ + if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { + zero = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); + } else if (!hit && ras->ras_window_len && + index < ras->ras_next_readahead && + index_in_window(index, ras->ras_window_start, 0, + ras->ras_window_len)) { + ra_miss = 1; + ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); + } + + /* On the second access to a file smaller than the tunable + * ra_max_read_ahead_whole_pages trigger RA on all pages in the + * file up to ra_max_pages_per_file. This is simply a best effort + * and only occurs once per open file. Normal RA behavior is reverted + * to for subsequent IO. The mmap case does not increment + * ras_requests and thus can never trigger this behavior. */ + if (ras->ras_requests == 2 && !ras->ras_request_index) { + __u64 kms_pages; + + kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages, + ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file); + + if (kms_pages && + kms_pages <= ra->ra_max_read_ahead_whole_pages) { + ras->ras_window_start = 0; + ras->ras_last_readpage = 0; + ras->ras_next_readahead = 0; + ras->ras_window_len = min(ra->ra_max_pages_per_file, + ra->ra_max_read_ahead_whole_pages); + GOTO(out_unlock, 0); + } + } + if (zero) { + /* check whether it is in stride I/O mode*/ + if (!index_in_stride_window(ras, index)) { + if (ras->ras_consecutive_stride_requests == 0 && + ras->ras_request_index == 0) { + ras_update_stride_detector(ras, index); + ras->ras_consecutive_stride_requests++; + } else { + ras_stride_reset(ras); + } + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + GOTO(out_unlock, 0); + } else { + ras->ras_consecutive_pages = 0; + ras->ras_consecutive_requests = 0; + if (++ras->ras_consecutive_stride_requests > 1) + stride_detect = 1; + RAS_CDEBUG(ras); + } + } else { + if (ra_miss) { + if (index_in_stride_window(ras, index) && + stride_io_mode(ras)) { + /*If stride-RA hit cache miss, the stride dector + *will not be reset to avoid the overhead of + *redetecting read-ahead mode */ + if (index != ras->ras_last_readpage + 1) + ras->ras_consecutive_pages = 0; + ras_reset(inode, ras, index); + RAS_CDEBUG(ras); + } else { + /* Reset both stride window and normal RA + * window */ + ras_reset(inode, ras, index); + ras->ras_consecutive_pages++; + ras_stride_reset(ras); + GOTO(out_unlock, 0); + } + } else if (stride_io_mode(ras)) { + /* If this is contiguous read but in stride I/O mode + * currently, check whether stride step still is valid, + * if invalid, it will reset the stride ra window*/ + if (!index_in_stride_window(ras, index)) { + /* Shrink stride read-ahead window to be zero */ + ras_stride_reset(ras); + ras->ras_window_len = 0; + ras->ras_next_readahead = index; + } + } + } + ras->ras_consecutive_pages++; + ras->ras_last_readpage = index; + ras_set_start(inode, ras, index); + + if (stride_io_mode(ras)) + /* Since stride readahead is sentivite to the offset + * of read-ahead, so we use original offset here, + * instead of ras_window_start, which is RPC aligned */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + else + ras->ras_next_readahead = max(ras->ras_window_start, + ras->ras_next_readahead); + RAS_CDEBUG(ras); + + /* Trigger RA in the mmap case where ras_consecutive_requests + * is not incremented and thus can't be used to trigger RA */ + if (!ras->ras_window_len && ras->ras_consecutive_pages == 4) { + ras->ras_window_len = RAS_INCREASE_STEP(inode); + GOTO(out_unlock, 0); + } + + /* Initially reset the stride window offset to next_readahead*/ + if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { + /** + * Once stride IO mode is detected, next_readahead should be + * reset to make sure next_readahead > stride offset + */ + ras->ras_next_readahead = max(index, ras->ras_next_readahead); + ras->ras_stride_offset = index; + ras->ras_window_len = RAS_INCREASE_STEP(inode); + } + + /* The initial ras_window_len is set to the request size. To avoid + * uselessly reading and discarding pages for random IO the window is + * only increased once per consecutive request received. */ + if ((ras->ras_consecutive_requests > 1 || stride_detect) && + !ras->ras_request_index) + ras_increase_window(inode, ras, ra); + EXIT; +out_unlock: + RAS_CDEBUG(ras); + ras->ras_request_index++; + spin_unlock(&ras->ras_lock); + return; +} + +int ll_writepage(struct page *vmpage, struct writeback_control *wbc) +{ + struct inode *inode = vmpage->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + struct cl_env_nest nest; + bool redirtied = false; + bool unlocked = false; + int result; + ENTRY; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + LASSERT(ll_i2dtexp(inode) != NULL); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + GOTO(out, result = PTR_ERR(env)); + + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io = ccc_env_thread_io(env); + io->ci_obj = clob; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + current); + cl_page_assume(env, io, page); + result = cl_page_flush(env, io, page); + if (result != 0) { + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) { + redirty_page_for_writepage(wbc, vmpage); + result = 0; + redirtied = true; + } + } + cl_page_disown(env, io, page); + unlocked = true; + lu_ref_del(&page->cp_reference, + "writepage", current); + cl_page_put(env, page); + } else { + result = PTR_ERR(page); + } + } + cl_io_fini(env, io); + + if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { + loff_t offset = cl_offset(clob, vmpage->index); + + /* Flush page failed because the extent is being written out. + * Wait for the write of extent to be finished to avoid + * breaking kernel which assumes ->writepage should mark + * PageWriteback or clean the page. */ + result = cl_sync_file_range(inode, offset, + offset + PAGE_CACHE_SIZE - 1, + CL_FSYNC_LOCAL); + if (result > 0) { + /* actually we may have written more than one page. + * decreasing this page because the caller will count + * it. */ + wbc->nr_to_write -= result - 1; + result = 0; + } + } + + cl_env_nested_put(&nest, env); + GOTO(out, result); + +out: + if (result < 0) { + if (!lli->lli_async_rc) + lli->lli_async_rc = result; + SetPageError(vmpage); + if (!unlocked) + unlock_page(vmpage); + } + return result; +} + +int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + loff_t start; + loff_t end; + enum cl_fsync_mode mode; + int range_whole = 0; + int result; + ENTRY; + + if (wbc->range_cyclic) { + start = mapping->writeback_index << PAGE_CACHE_SHIFT; + end = OBD_OBJECT_EOF; + } else { + start = wbc->range_start; + end = wbc->range_end; + if (end == LLONG_MAX) { + end = OBD_OBJECT_EOF; + range_whole = start == 0; + } + } + + mode = CL_FSYNC_NONE; + if (wbc->sync_mode == WB_SYNC_ALL) + mode = CL_FSYNC_LOCAL; + + result = cl_sync_file_range(inode, start, end, mode); + if (result > 0) { + wbc->nr_to_write -= result; + result = 0; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { + if (end == OBD_OBJECT_EOF) + end = i_size_read(inode); + mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1; + } + RETURN(result); +} + +int ll_readpage(struct file *file, struct page *vmpage) +{ + struct ll_cl_context *lcc; + int result; + ENTRY; + + lcc = ll_cl_init(file, vmpage, 0); + if (!IS_ERR(lcc)) { + struct lu_env *env = lcc->lcc_env; + struct cl_io *io = lcc->lcc_io; + struct cl_page *page = lcc->lcc_page; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + result = cl_io_read_page(env, io, page); + } else { + /* Page from a non-object file. */ + unlock_page(vmpage); + result = 0; + } + ll_cl_fini(lcc); + } else { + unlock_page(vmpage); + result = PTR_ERR(lcc); + } + RETURN(result); +} diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c new file mode 100644 index 000000000000..27e4e64bc1e7 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -0,0 +1,586 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/llite/rw26.c + * + * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" +#include + +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). + * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static void ll_invalidatepage(struct page *vmpage, unsigned long offset) +{ + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + int refcheck; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ + if (offset == 0) { + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + lu_ref_add(&page->cp_reference, + "delete", vmpage); + cl_page_delete(env, page); + lu_ref_del(&page->cp_reference, + "delete", vmpage); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + cl_env_put(env, &refcheck); + } + } +} + +#ifdef HAVE_RELEASEPAGE_WITH_INT +#define RELEASEPAGE_ARG_TYPE int +#else +#define RELEASEPAGE_ARG_TYPE gfp_t +#endif +static int ll_releasepage(struct page *vmpage, RELEASEPAGE_ARG_TYPE gfp_mask) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_object *obj; + struct cl_page *page; + struct address_space *mapping; + int result; + + LASSERT(PageLocked(vmpage)); + if (PageWriteback(vmpage) || PageDirty(vmpage)) + return 0; + + mapping = vmpage->mapping; + if (mapping == NULL) + return 1; + + obj = ll_i2info(mapping->host)->lli_clob; + if (obj == NULL) + return 1; + + /* 1 for page allocator, 1 for cl_page and 1 for page cache */ + if (page_count(vmpage) > 3) + return 0; + + /* TODO: determine what gfp should be used by @gfp_mask. */ + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + /* If we can't allocate an env we won't call cl_page_put() + * later on which further means it's impossible to drop + * page refcount by cl_page, so ask kernel to not free + * this page. */ + return 0; + + page = cl_vmpage_page(vmpage, obj); + result = page == NULL; + if (page != NULL) { + if (!cl_page_in_use(page)) { + result = 1; + cl_page_delete(env, page); + } + cl_page_put(env, page); + } + cl_env_nested_put(&nest, env); + return result; +} + +static int ll_set_page_dirty(struct page *vmpage) +{ +#if 0 + struct cl_page *page = vvp_vmpage_page_transient(vmpage); + struct vvp_object *obj = cl_inode2vvp(vmpage->mapping->host); + struct vvp_page *cpg; + + /* + * XXX should page method be called here? + */ + LASSERT(&obj->co_cl == page->cp_obj); + cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + /* + * XXX cannot do much here, because page is possibly not locked: + * sys_munmap()->... + * ->unmap_page_range()->zap_pte_range()->set_page_dirty(). + */ + vvp_write_pending(obj, cpg); +#endif + RETURN(__set_page_dirty_nobuffers(vmpage)); +} + +#define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL + +static inline int ll_get_user_pages(int rw, unsigned long user_addr, + size_t size, struct page ***pages, + int *max_pages) +{ + int result = -ENOMEM; + + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) { + *pages = NULL; + return -EFBIG; + } + + *max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + *max_pages -= user_addr >> PAGE_CACHE_SHIFT; + + OBD_ALLOC_LARGE(*pages, *max_pages * sizeof(**pages)); + if (*pages) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + *max_pages, (rw == READ), 0, *pages, + NULL); + up_read(¤t->mm->mmap_sem); + if (unlikely(result <= 0)) + OBD_FREE_LARGE(*pages, *max_pages * sizeof(**pages)); + } + + return result; +} + +/* ll_free_user_pages - tear down page struct array + * @pages: array of page struct pointers underlying target buffer */ +static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) +{ + int i; + + for (i = 0; i < npages; i++) { + if (pages[i] == NULL) + break; + if (do_dirty) + set_page_dirty_lock(pages[i]); + page_cache_release(pages[i]); + } + + OBD_FREE_LARGE(pages, npages * sizeof(*pages)); +} + +ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv) +{ + struct cl_page *clp; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + int i; + ssize_t rc = 0; + loff_t file_offset = pv->ldp_start_offset; + long size = pv->ldp_size; + int page_count = pv->ldp_nr; + struct page **pages = pv->ldp_pages; + long page_size = cl_page_size(obj); + bool do_io; + int io_pages = 0; + ENTRY; + + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + if (pv->ldp_offsets) + file_offset = pv->ldp_offsets[i]; + + LASSERT(!(file_offset & (page_size - 1))); + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pv->ldp_pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + do_io = true; + + /* check the page type: if the page is a host page, then do + * write directly */ + if (clp->cp_type == CPT_CACHEABLE) { + struct page *vmpage = cl_page_vmpage(env, clp); + struct page *src_page; + struct page *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = ll_kmap_atomic(src_page, KM_USER0); + dst = ll_kmap_atomic(dst_page, KM_USER1); + memcpy(dst, src, min(page_size, size)); + ll_kunmap_atomic(dst, KM_USER1); + ll_kunmap_atomic(src, KM_USER0); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). */ + if (rw == WRITE) + set_page_dirty(vmpage); + + if (rw == READ) { + /* do not issue the page for read, since it + * may reread a ra page which has NOT uptodate + * bit set. */ + cl_page_disown(env, io, clp); + do_io = false; + } + } + + if (likely(do_io)) { + cl_2queue_add(queue, clp); + + /* + * Set page clip to tell transfer formation engine + * that page has to be sent even if it is beyond KMS. + */ + cl_page_clip(env, clp, 0, min(size, page_size)); + + ++io_pages; + } + + /* drop the reference count for cl_page_find */ + cl_page_put(env, clp); + size -= page_size; + file_offset += page_size; + } + + if (rc == 0 && io_pages) { + rc = cl_io_submit_sync(env, io, + rw == READ ? CRT_READ : CRT_WRITE, + queue, 0); + } + if (rc == 0) + rc = pv->ldp_size; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + RETURN(rc); +} +EXPORT_SYMBOL(ll_direct_rw_pages); + +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct address_space *mapping, + size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct ll_dio_pages pvec = { .ldp_pages = pages, + .ldp_nr = page_count, + .ldp_size = size, + .ldp_offsets = NULL, + .ldp_start_offset = file_offset + }; + + return ll_direct_rw_pages(env, io, rw, inode, &pvec); +} + +#ifdef KMALLOC_MAX_SIZE +#define MAX_MALLOC KMALLOC_MAX_SIZE +#else +#define MAX_MALLOC (128 * 1024) +#endif + +/* This is the maximum size of a single O_DIRECT request, based on the + * kmalloc limit. We need to fit all of the brw_page structs, each one + * representing PAGE_SIZE worth of user data, into a single buffer, and + * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is + * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */ +#define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) & \ + ~(DT_MAX_BRW_SIZE - 1)) +static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) +{ + struct lu_env *env; + struct cl_io *io; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ccc_object *obj = cl_inode2ccc(inode); + long count = iov_length(iov, nr_segs); + long tot_bytes = 0, result = 0; + struct ll_inode_info *lli = ll_i2info(inode); + unsigned long seg = 0; + long size = MAX_DIO_SIZE; + int refcheck; + ENTRY; + + if (!lli->lli_has_smd) + RETURN(-EBADF); + + /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ + if ((file_offset & ~CFS_PAGE_MASK) || (count & ~CFS_PAGE_MASK)) + RETURN(-EINVAL); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), size=%lu (max %lu), " + "offset=%lld=%llx, pages %lu (max %lu)\n", + inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE, + file_offset, file_offset, count >> PAGE_CACHE_SHIFT, + MAX_DIO_SIZE >> PAGE_CACHE_SHIFT); + + /* Check that all user buffers are aligned as well */ + for (seg = 0; seg < nr_segs; seg++) { + if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) || + (iov[seg].iov_len & ~CFS_PAGE_MASK)) + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + io = ccc_env_io(env)->cui_cl.cis_io; + LASSERT(io != NULL); + + /* 0. Need locking between buffered and direct access. and race with + * size changing by concurrent truncates and writes. + * 1. Need inode mutex to operate transient pages. + */ + if (rw == READ) + mutex_lock(&inode->i_mutex); + + LASSERT(obj->cob_transient_pages == 0); + for (seg = 0; seg < nr_segs; seg++) { + long iov_left = iov[seg].iov_len; + unsigned long user_addr = (unsigned long)iov[seg].iov_base; + + if (rw == READ) { + if (file_offset >= i_size_read(inode)) + break; + if (file_offset + iov_left > i_size_read(inode)) + iov_left = i_size_read(inode) - file_offset; + } + + while (iov_left > 0) { + struct page **pages; + int page_count, max_pages = 0; + long bytes; + + bytes = min(size, iov_left); + page_count = ll_get_user_pages(rw, user_addr, bytes, + &pages, &max_pages); + if (likely(page_count > 0)) { + if (unlikely(page_count < max_pages)) + bytes = page_count << PAGE_CACHE_SHIFT; + result = ll_direct_IO_26_seg(env, io, rw, inode, + file->f_mapping, + bytes, file_offset, + pages, page_count); + ll_free_user_pages(pages, max_pages, rw==READ); + } else if (page_count == 0) { + GOTO(out, result = -EFAULT); + } else { + result = page_count; + } + if (unlikely(result <= 0)) { + /* If we can't allocate a large enough buffer + * for the request, shrink it to a smaller + * PAGE_SIZE multiple and try again. + * We should always be able to kmalloc for a + * page worth of page pointers = 4MB on i386. */ + if (result == -ENOMEM && + size > (PAGE_CACHE_SIZE / sizeof(*pages)) * + PAGE_CACHE_SIZE) { + size = ((((size / 2) - 1) | + ~CFS_PAGE_MASK) + 1) & + CFS_PAGE_MASK; + CDEBUG(D_VFSTRACE,"DIO size now %lu\n", + size); + continue; + } + + GOTO(out, result); + } + + tot_bytes += result; + file_offset += result; + iov_left -= result; + user_addr += result; + } + } +out: + LASSERT(obj->cob_transient_pages == 0); + if (rw == READ) + mutex_unlock(&inode->i_mutex); + + if (tot_bytes > 0) { + if (rw == WRITE) { + struct lov_stripe_md *lsm; + + lsm = ccc_inode_lsm_get(inode); + LASSERT(lsm != NULL); + lov_stripe_lock(lsm); + obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0); + lov_stripe_unlock(lsm); + ccc_inode_lsm_put(inode, lsm); + } + } + + cl_env_put(env, &refcheck); + RETURN(tot_bytes ? : result); +} + +static int ll_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int rc; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + ENTRY; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + RETURN(-ENOMEM); + + *pagep = page; + + rc = ll_prepare_write(file, page, from, from + len); + if (rc) { + unlock_page(page); + page_cache_release(page); + } + RETURN(rc); +} + +static int ll_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + int rc; + + rc = ll_commit_write(file, page, from, from + copied); + unlock_page(page); + page_cache_release(page); + + return rc ?: copied; +} + +#ifdef CONFIG_MIGRATION +int ll_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page + , enum migrate_mode mode + ) +{ + /* Always fail page migration until we have a proper implementation */ + return -EIO; +} +#endif + +#ifndef MS_HAS_NEW_AOPS +struct address_space_operations ll_aops = { + .readpage = ll_readpage, +// .readpages = ll_readpages, + .direct_IO = ll_direct_IO_26, + .writepage = ll_writepage, + .writepages = ll_writepages, + .set_page_dirty = ll_set_page_dirty, + .write_begin = ll_write_begin, + .write_end = ll_write_end, + .invalidatepage = ll_invalidatepage, + .releasepage = (void *)ll_releasepage, +#ifdef CONFIG_MIGRATION + .migratepage = ll_migratepage, +#endif + .bmap = NULL +}; +#else +struct address_space_operations_ext ll_aops = { + .orig_aops.readpage = ll_readpage, +// .orig_aops.readpages = ll_readpages, + .orig_aops.direct_IO = ll_direct_IO_26, + .orig_aops.writepage = ll_writepage, + .orig_aops.writepages = ll_writepages, + .orig_aops.set_page_dirty = ll_set_page_dirty, + .orig_aops.prepare_write = ll_prepare_write, + .orig_aops.commit_write = ll_commit_write, + .orig_aops.invalidatepage = ll_invalidatepage, + .orig_aops.releasepage = ll_releasepage, +#ifdef CONFIG_MIGRATION + .orig_aops.migratepage = ll_migratepage, +#endif + .orig_aops.bmap = NULL, + .write_begin = ll_write_begin, + .write_end = ll_write_end +}; +#endif diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c new file mode 100644 index 000000000000..7747f8f2079d --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/statahead.c @@ -0,0 +1,1722 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include "llite_internal.h" + +#define SA_OMITTED_ENTRY_MAX 8ULL + +typedef enum { + /** negative values are for error cases */ + SA_ENTRY_INIT = 0, /** init entry */ + SA_ENTRY_SUCC = 1, /** stat succeed */ + SA_ENTRY_INVA = 2, /** invalid entry */ + SA_ENTRY_DEST = 3, /** entry to be destroyed */ +} se_stat_t; + +struct ll_sa_entry { + /* link into sai->sai_entries */ + struct list_head se_link; + /* link into sai->sai_entries_{received,stated} */ + struct list_head se_list; + /* link into sai hash table locally */ + struct list_head se_hash; + /* entry reference count */ + atomic_t se_refcount; + /* entry index in the sai */ + __u64 se_index; + /* low layer ldlm lock handle */ + __u64 se_handle; + /* entry status */ + se_stat_t se_stat; + /* entry size, contains name */ + int se_size; + /* pointer to async getattr enqueue info */ + struct md_enqueue_info *se_minfo; + /* pointer to the async getattr request */ + struct ptlrpc_request *se_req; + /* pointer to the target inode */ + struct inode *se_inode; + /* entry name */ + struct qstr se_qstr; +}; + +static unsigned int sai_generation = 0; +static DEFINE_SPINLOCK(sai_generation_lock); + +static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) +{ + return list_empty(&entry->se_hash); +} + +/* + * The entry only can be released by the caller, it is necessary to hold lock. + */ +static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) +{ + smp_rmb(); + return (entry->se_stat != SA_ENTRY_INIT); +} + +static inline int ll_sa_entry_hash(int val) +{ + return val & LL_SA_CACHE_MASK; +} + +/* + * Insert entry to hash SA table. + */ +static inline void +ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_add_tail(&entry->se_hash, &sai->sai_cache[i]); + spin_unlock(&sai->sai_cache_lock[i]); +} + +/* + * Remove entry from SA table. + */ +static inline void +ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + int i = ll_sa_entry_hash(entry->se_qstr.hash); + + spin_lock(&sai->sai_cache_lock[i]); + list_del_init(&entry->se_hash); + spin_unlock(&sai->sai_cache_lock[i]); +} + +static inline int agl_should_run(struct ll_statahead_info *sai, + struct inode *inode) +{ + return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); +} + +static inline struct ll_sa_entry * +sa_first_received_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_received.next, + struct ll_sa_entry, se_list); +} + +static inline struct ll_inode_info * +agl_first_entry(struct ll_statahead_info *sai) +{ + return list_entry(sai->sai_entries_agl.next, + struct ll_inode_info, lli_agl_list); +} + +static inline int sa_sent_full(struct ll_statahead_info *sai) +{ + return atomic_read(&sai->sai_cache_count) >= sai->sai_max; +} + +static inline int sa_received_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_received); +} + +static inline int agl_list_empty(struct ll_statahead_info *sai) +{ + return list_empty(&sai->sai_entries_agl); +} + +/** + * (1) hit ratio less than 80% + * or + * (2) consecutive miss more than 8 + * then means low hit. + */ +static inline int sa_low_hit(struct ll_statahead_info *sai) +{ + return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || + (sai->sai_consecutive_miss > 8)); +} + +/* + * If the given index is behind of statahead window more than + * SA_OMITTED_ENTRY_MAX, then it is old. + */ +static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) +{ + return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < + sai->sai_index); +} + +/* + * Insert it into sai_entries tail when init. + */ +static struct ll_sa_entry * +ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index, + const char *name, int len) +{ + struct ll_inode_info *lli; + struct ll_sa_entry *entry; + int entry_size; + char *dname; + ENTRY; + + entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; + OBD_ALLOC(entry, entry_size); + if (unlikely(entry == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n", + len, name, entry, index); + + entry->se_index = index; + + /* + * Statahead entry reference rules: + * + * 1) When statahead entry is initialized, its reference is set as 2. + * One reference is used by the directory scanner. When the scanner + * searches the statahead cache for the given name, it can perform + * lockless hash lookup (only the scanner can remove entry from hash + * list), and once found, it needn't to call "atomic_inc()" for the + * entry reference. So the performance is improved. After using the + * statahead entry, the scanner will call "atomic_dec()" to drop the + * reference held when initialization. If it is the last reference, + * the statahead entry will be freed. + * + * 2) All other threads, including statahead thread and ptlrpcd thread, + * when they process the statahead entry, the reference for target + * should be held to guarantee the entry will not be released by the + * directory scanner. After processing the entry, these threads will + * drop the entry reference. If it is the last reference, the entry + * will be freed. + * + * The second reference when initializes the statahead entry is used + * by the statahead thread, following the rule 2). + */ + atomic_set(&entry->se_refcount, 2); + entry->se_stat = SA_ENTRY_INIT; + entry->se_size = entry_size; + dname = (char *)entry + sizeof(struct ll_sa_entry); + memcpy(dname, name, len); + dname[len] = 0; + entry->se_qstr.hash = full_name_hash(name, len); + entry->se_qstr.len = len; + entry->se_qstr.name = dname; + + lli = ll_i2info(sai->sai_inode); + spin_lock(&lli->lli_sa_lock); + list_add_tail(&entry->se_link, &sai->sai_entries); + INIT_LIST_HEAD(&entry->se_list); + ll_sa_entry_enhash(sai, entry); + spin_unlock(&lli->lli_sa_lock); + + atomic_inc(&sai->sai_cache_count); + + RETURN(entry); +} + +/* + * Used by the directory scanner to search entry with name. + * + * Only the caller can remove the entry from hash, so it is unnecessary to hold + * hash lock. It is caller's duty to release the init refcount on the entry, so + * it is also unnecessary to increase refcount on the entry. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) +{ + struct ll_sa_entry *entry; + int i = ll_sa_entry_hash(qstr->hash); + + list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { + if (entry->se_qstr.hash == qstr->hash && + entry->se_qstr.len == qstr->len && + memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) + return entry; + } + return NULL; +} + +/* + * Used by the async getattr request callback to find entry with index. + * + * Inside lli_sa_lock to prevent others to change the list during the search. + * It needs to increase entry refcount before returning to guarantee that the + * entry cannot be freed by others. + */ +static struct ll_sa_entry * +ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) +{ + struct ll_sa_entry *entry; + + list_for_each_entry(entry, &sai->sai_entries, se_link) { + if (entry->se_index == index) { + LASSERT(atomic_read(&entry->se_refcount) > 0); + atomic_inc(&entry->se_refcount); + return entry; + } + if (entry->se_index > index) + break; + } + return NULL; +} + +static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo = entry->se_minfo; + struct ptlrpc_request *req = entry->se_req; + + if (minfo) { + entry->se_minfo = NULL; + ll_intent_release(&minfo->mi_it); + iput(minfo->mi_dir); + OBD_FREE_PTR(minfo); + } + + if (req) { + entry->se_req = NULL; + ptlrpc_req_finished(req); + } +} + +static void ll_sa_entry_put(struct ll_statahead_info *sai, + struct ll_sa_entry *entry) +{ + if (atomic_dec_and_test(&entry->se_refcount)) { + CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n", + entry->se_qstr.len, entry->se_qstr.name, entry, + entry->se_index); + + LASSERT(list_empty(&entry->se_link)); + LASSERT(list_empty(&entry->se_list)); + LASSERT(ll_sa_entry_unhashed(entry)); + + ll_sa_entry_cleanup(sai, entry); + if (entry->se_inode) + iput(entry->se_inode); + + OBD_FREE(entry, entry->se_size); + atomic_dec(&sai->sai_cache_count); + } +} + +static inline void +do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + LASSERT(!ll_sa_entry_unhashed(entry)); + LASSERT(!list_empty(&entry->se_link)); + + ll_sa_entry_unhash(sai, entry); + + spin_lock(&lli->lli_sa_lock); + entry->se_stat = SA_ENTRY_DEST; + list_del_init(&entry->se_link); + if (likely(!list_empty(&entry->se_list))) + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); +} + +/* + * Delete it from sai_entries_stated list when fini. + */ +static void +ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ll_sa_entry *pos, *next; + + if (entry) + do_sa_entry_fini(sai, entry); + + /* drop old entry, only 'scanner' process does this, no need to lock */ + list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { + if (!is_omitted_entry(sai, pos->se_index)) + break; + do_sa_entry_fini(sai, pos); + } +} + +/* + * Inside lli_sa_lock. + */ +static void +do_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_sa_entry *se; + struct list_head *pos = &sai->sai_entries_stated; + + if (!list_empty(&entry->se_list)) + list_del_init(&entry->se_list); + + list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { + if (se->se_index < entry->se_index) { + pos = &se->se_list; + break; + } + } + + list_add(&entry->se_list, pos); + entry->se_stat = stat; +} + +/* + * Move entry to sai_entries_stated and sort with the index. + * \retval 1 -- entry to be destroyed. + * \retval 0 -- entry is inserted into stated list. + */ +static int +ll_sa_entry_to_stated(struct ll_statahead_info *sai, + struct ll_sa_entry *entry, se_stat_t stat) +{ + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + int ret = 1; + + ll_sa_entry_cleanup(sai, entry); + + spin_lock(&lli->lli_sa_lock); + if (likely(entry->se_stat != SA_ENTRY_DEST)) { + do_sa_entry_to_stated(sai, entry, stat); + ret = 0; + } + spin_unlock(&lli->lli_sa_lock); + + return ret; +} + +/* + * Insert inode into the list of sai_entries_agl. + */ +static void ll_agl_add(struct ll_statahead_info *sai, + struct inode *inode, int index) +{ + struct ll_inode_info *child = ll_i2info(inode); + struct ll_inode_info *parent = ll_i2info(sai->sai_inode); + int added = 0; + + spin_lock(&child->lli_agl_lock); + if (child->lli_agl_index == 0) { + child->lli_agl_index = index; + spin_unlock(&child->lli_agl_lock); + + LASSERT(list_empty(&child->lli_agl_list)); + + igrab(inode); + spin_lock(&parent->lli_agl_lock); + if (agl_list_empty(sai)) + added = 1; + list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); + spin_unlock(&parent->lli_agl_lock); + } else { + spin_unlock(&child->lli_agl_lock); + } + + if (added > 0) + wake_up(&sai->sai_agl_thread.t_ctl_waitq); +} + +static struct ll_statahead_info *ll_sai_alloc(void) +{ + struct ll_statahead_info *sai; + int i; + ENTRY; + + OBD_ALLOC_PTR(sai); + if (!sai) + RETURN(NULL); + + atomic_set(&sai->sai_refcount, 1); + + spin_lock(&sai_generation_lock); + sai->sai_generation = ++sai_generation; + if (unlikely(sai_generation == 0)) + sai->sai_generation = ++sai_generation; + spin_unlock(&sai_generation_lock); + + sai->sai_max = LL_SA_RPC_MIN; + sai->sai_index = 1; + init_waitqueue_head(&sai->sai_waitq); + init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); + init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); + + INIT_LIST_HEAD(&sai->sai_entries); + INIT_LIST_HEAD(&sai->sai_entries_received); + INIT_LIST_HEAD(&sai->sai_entries_stated); + INIT_LIST_HEAD(&sai->sai_entries_agl); + + for (i = 0; i < LL_SA_CACHE_SIZE; i++) { + INIT_LIST_HEAD(&sai->sai_cache[i]); + spin_lock_init(&sai->sai_cache_lock[i]); + } + atomic_set(&sai->sai_cache_count, 0); + + RETURN(sai); +} + +static inline struct ll_statahead_info * +ll_sai_get(struct ll_statahead_info *sai) +{ + atomic_inc(&sai->sai_refcount); + return sai; +} + +static void ll_sai_put(struct ll_statahead_info *sai) +{ + struct inode *inode = sai->sai_inode; + struct ll_inode_info *lli = ll_i2info(inode); + ENTRY; + + if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { + struct ll_sa_entry *entry, *next; + + if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { + /* It is race case, the interpret callback just hold + * a reference count */ + spin_unlock(&lli->lli_sa_lock); + RETURN_EXIT; + } + + LASSERT(lli->lli_opendir_key == NULL); + LASSERT(thread_is_stopped(&sai->sai_thread)); + LASSERT(thread_is_stopped(&sai->sai_agl_thread)); + + lli->lli_sai = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + + if (sai->sai_sent > sai->sai_replied) + CDEBUG(D_READA,"statahead for dir "DFID" does not " + "finish: [sent:"LPU64"] [replied:"LPU64"]\n", + PFID(&lli->lli_fid), + sai->sai_sent, sai->sai_replied); + + list_for_each_entry_safe(entry, next, + &sai->sai_entries, se_link) + do_sa_entry_fini(sai, entry); + + LASSERT(list_empty(&sai->sai_entries)); + LASSERT(sa_received_empty(sai)); + LASSERT(list_empty(&sai->sai_entries_stated)); + + LASSERT(atomic_read(&sai->sai_cache_count) == 0); + LASSERT(agl_list_empty(sai)); + + iput(inode); + OBD_FREE_PTR(sai); + } + + EXIT; +} + +/* Do NOT forget to drop inode refcount when into sai_entries_agl. */ +static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) +{ + struct ll_inode_info *lli = ll_i2info(inode); + __u64 index = lli->lli_agl_index; + int rc; + ENTRY; + + LASSERT(list_empty(&lli->lli_agl_list)); + + /* AGL maybe fall behind statahead with one entry */ + if (is_omitted_entry(sai, index + 1)) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* Someone is in glimpse (sync or async), do nothing. */ + rc = down_write_trylock(&lli->lli_glimpse_sem); + if (rc == 0) { + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + /* + * Someone triggered glimpse within 1 sec before. + * 1) The former glimpse succeeded with glimpse lock granted by OST, and + * if the lock is still cached on client, AGL needs to do nothing. If + * it is cancelled by other client, AGL maybe cannot obtaion new lock + * for no glimpse callback triggered by AGL. + * 2) The former glimpse succeeded, but OST did not grant glimpse lock. + * Under such case, it is quite possible that the OST will not grant + * glimpse lock for AGL also. + * 3) The former glimpse failed, compared with other two cases, it is + * relative rare. AGL can ignore such case, and it will not muchly + * affect the performance. + */ + if (lli->lli_glimpse_time != 0 && + cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { + up_write(&lli->lli_glimpse_sem); + lli->lli_agl_index = 0; + iput(inode); + RETURN_EXIT; + } + + CDEBUG(D_READA, "Handling (init) async glimpse: inode = " + DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index); + + cl_agl(inode); + lli->lli_agl_index = 0; + lli->lli_glimpse_time = cfs_time_current(); + up_write(&lli->lli_glimpse_sem); + + CDEBUG(D_READA, "Handled (init) async glimpse: inode= " + DFID", idx = "LPU64", rc = %d\n", + PFID(&lli->lli_fid), index, rc); + + iput(inode); + + EXIT; +} + +static void ll_post_statahead(struct ll_statahead_info *sai) +{ + struct inode *dir = sai->sai_inode; + struct inode *child; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_sa_entry *entry; + struct md_enqueue_info *minfo; + struct lookup_intent *it; + struct ptlrpc_request *req; + struct mdt_body *body; + int rc = 0; + ENTRY; + + spin_lock(&lli->lli_sa_lock); + if (unlikely(sa_received_empty(sai))) { + spin_unlock(&lli->lli_sa_lock); + RETURN_EXIT; + } + entry = sa_first_received_entry(sai); + atomic_inc(&entry->se_refcount); + list_del_init(&entry->se_list); + spin_unlock(&lli->lli_sa_lock); + + LASSERT(entry->se_handle != 0); + + minfo = entry->se_minfo; + it = &minfo->mi_it; + req = entry->se_req; + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + child = entry->se_inode; + if (child == NULL) { + /* + * lookup. + */ + LASSERT(fid_is_zero(&minfo->mi_data.op_fid2)); + + /* XXX: No fid in reply, this is probaly cross-ref case. + * SA can't handle it yet. */ + if (body->valid & OBD_MD_MDS) + GOTO(out, rc = -EAGAIN); + } else { + /* + * revalidate. + */ + /* unlinked and re-created with the same name */ + if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){ + entry->se_inode = NULL; + iput(child); + child = NULL; + } + } + + it->d.lustre.it_lock_handle = entry->se_handle; + rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); + if (rc != 1) + GOTO(out, rc = -EAGAIN); + + rc = ll_prep_inode(&child, req, dir->i_sb, it); + if (rc) + GOTO(out, rc); + + CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", + child, child->i_ino, child->i_generation); + ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); + + entry->se_inode = child; + + if (agl_should_run(sai, child)) + ll_agl_add(sai, child, entry->se_index); + + EXIT; + +out: + /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock + * reference count by calling "ll_intent_drop_lock()" in spite of the + * above operations failed or not. Do not worry about calling + * "ll_intent_drop_lock()" more than once. */ + rc = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + ll_sa_entry_put(sai, entry); +} + +static int ll_statahead_interpret(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, int rc) +{ + struct lookup_intent *it = &minfo->mi_it; + struct inode *dir = minfo->mi_dir; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = NULL; + struct ll_sa_entry *entry; + int wakeup; + ENTRY; + + if (it_disposition(it, DISP_LOOKUP_NEG)) + rc = -ENOENT; + + spin_lock(&lli->lli_sa_lock); + /* stale entry */ + if (unlikely(lli->lli_sai == NULL || + lli->lli_sai->sai_generation != minfo->mi_generation)) { + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -ESTALE); + } else { + sai = ll_sai_get(lli->lli_sai); + if (unlikely(!thread_is_running(&sai->sai_thread))) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -EBADFD); + } + + entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); + if (entry == NULL) { + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + GOTO(out, rc = -EIDRM); + } + + if (rc != 0) { + do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); + wakeup = (entry->se_index == sai->sai_index_wait); + } else { + entry->se_minfo = minfo; + entry->se_req = ptlrpc_request_addref(req); + /* Release the async ibits lock ASAP to avoid deadlock + * when statahead thread tries to enqueue lock on parent + * for readpage and other tries to enqueue lock on child + * with parent's lock held, for example: unlink. */ + entry->se_handle = it->d.lustre.it_lock_handle; + ll_intent_drop_lock(it); + wakeup = sa_received_empty(sai); + list_add_tail(&entry->se_list, + &sai->sai_entries_received); + } + sai->sai_replied++; + spin_unlock(&lli->lli_sa_lock); + + ll_sa_entry_put(sai, entry); + if (wakeup) + wake_up(&sai->sai_thread.t_ctl_waitq); + } + + EXIT; + +out: + if (rc != 0) { + ll_intent_release(it); + iput(dir); + OBD_FREE_PTR(minfo); + } + if (sai != NULL) + ll_sai_put(sai); + return rc; +} + +static void sa_args_fini(struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + LASSERT(minfo && einfo); + iput(minfo->mi_dir); + capa_put(minfo->mi_data.op_capa1); + capa_put(minfo->mi_data.op_capa2); + OBD_FREE_PTR(minfo); + OBD_FREE_PTR(einfo); +} + +/** + * There is race condition between "capa_put" and "ll_statahead_interpret" for + * accessing "op_data.op_capa[1,2]" as following: + * "capa_put" releases "op_data.op_capa[1,2]"'s reference count after calling + * "md_intent_getattr_async". But "ll_statahead_interpret" maybe run first, and + * fill "op_data.op_capa[1,2]" as POISON, then cause "capa_put" access invalid + * "ocapa". So here reserve "op_data.op_capa[1,2]" in "pcapa" before calling + * "md_intent_getattr_async". + */ +static int sa_args_init(struct inode *dir, struct inode *child, + struct ll_sa_entry *entry, struct md_enqueue_info **pmi, + struct ldlm_enqueue_info **pei, + struct obd_capa **pcapa) +{ + struct qstr *qstr = &entry->se_qstr; + struct ll_inode_info *lli = ll_i2info(dir); + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct md_op_data *op_data; + + OBD_ALLOC_PTR(einfo); + if (einfo == NULL) + return -ENOMEM; + + OBD_ALLOC_PTR(minfo); + if (minfo == NULL) { + OBD_FREE_PTR(einfo); + return -ENOMEM; + } + + op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, + qstr->len, 0, LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) { + OBD_FREE_PTR(einfo); + OBD_FREE_PTR(minfo); + return PTR_ERR(op_data); + } + + minfo->mi_it.it_op = IT_GETATTR; + minfo->mi_dir = igrab(dir); + minfo->mi_cb = ll_statahead_interpret; + minfo->mi_generation = lli->lli_sai->sai_generation; + minfo->mi_cbdata = entry->se_index; + + einfo->ei_type = LDLM_IBITS; + einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); + einfo->ei_cb_bl = ll_md_blocking_ast; + einfo->ei_cb_cp = ldlm_completion_ast; + einfo->ei_cb_gl = NULL; + einfo->ei_cbdata = NULL; + + *pmi = minfo; + *pei = einfo; + pcapa[0] = op_data->op_capa1; + pcapa[1] = op_data->op_capa2; + + return 0; +} + +static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) +{ + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + ENTRY; + + rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas); + if (rc) + RETURN(rc); + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + sa_args_fini(minfo, einfo); + } + + RETURN(rc); +} + +/** + * similar to ll_revalidate_it(). + * \retval 1 -- dentry valid + * \retval 0 -- will send stat-ahead request + * \retval others -- prepare stat-ahead request failed + */ +static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = 0 }; + struct md_enqueue_info *minfo; + struct ldlm_enqueue_info *einfo; + struct obd_capa *capas[2]; + int rc; + ENTRY; + + if (unlikely(inode == NULL)) + RETURN(1); + + if (d_mountpoint(dentry)) + RETURN(1); + + if (unlikely(dentry == dentry->d_sb->s_root)) + RETURN(1); + + entry->se_inode = igrab(inode); + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),NULL); + if (rc == 1) { + entry->se_handle = it.d.lustre.it_lock_handle; + ll_intent_release(&it); + RETURN(1); + } + + rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas); + if (rc) { + entry->se_inode = NULL; + iput(inode); + RETURN(rc); + } + + rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); + if (!rc) { + capa_put(capas[0]); + capa_put(capas[1]); + } else { + entry->se_inode = NULL; + iput(inode); + sa_args_fini(minfo, einfo); + } + + RETURN(rc); +} + +static void ll_statahead_one(struct dentry *parent, const char* entry_name, + int entry_name_len) +{ + struct inode *dir = parent->d_inode; + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *dentry = NULL; + struct ll_sa_entry *entry; + int rc; + int rc1; + ENTRY; + + entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name, + entry_name_len); + if (IS_ERR(entry)) + RETURN_EXIT; + + dentry = d_lookup(parent, &entry->se_qstr); + if (!dentry) { + rc = do_sa_lookup(dir, entry); + } else { + rc = do_sa_revalidate(dir, entry, dentry); + if (rc == 1 && agl_should_run(sai, dentry->d_inode)) + ll_agl_add(sai, dentry->d_inode, entry->se_index); + } + + if (dentry != NULL) + dput(dentry); + + if (rc) { + rc1 = ll_sa_entry_to_stated(sai, entry, + rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); + if (rc1 == 0 && entry->se_index == sai->sai_index_wait) + wake_up(&sai->sai_waitq); + } else { + sai->sai_sent++; + } + + sai->sai_index++; + /* drop one refcount on entry by ll_sa_entry_alloc */ + ll_sa_entry_put(sai, entry); + + EXIT; +} + +static int ll_agl_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + ENTRY; + + CDEBUG(D_READA, "agl thread started: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + atomic_inc(&sbi->ll_agl_total); + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 1; + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + + while (1) { + l_wait_event(thread->t_ctl_waitq, + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + + if (!thread_is_running(thread)) + break; + + spin_lock(&plli->lli_agl_lock); + /* The statahead thread maybe help to process AGL entries, + * so check whether list empty again. */ + if (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + } else { + spin_unlock(&plli->lli_agl_lock); + } + } + + spin_lock(&plli->lli_agl_lock); + sai->sai_agl_valid = 0; + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + clli->lli_agl_index = 0; + iput(&clli->lli_vfs_inode); + spin_lock(&plli->lli_agl_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_agl_lock); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + CDEBUG(D_READA, "agl thread stopped: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + RETURN(0); +} + +static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) +{ + struct ptlrpc_thread *thread = &sai->sai_agl_thread; + struct l_wait_info lwi = { 0 }; + struct ll_inode_info *plli; + task_t *task; + ENTRY; + + CDEBUG(D_READA, "start agl thread: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + plli = ll_i2info(parent->d_inode); + task = kthread_run(ll_agl_thread, parent, + "ll_agl_%u", plli->lli_opendir_pid); + if (IS_ERR(task)) { + CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); + thread_set_flags(thread, SVC_STOPPED); + RETURN_EXIT; + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + EXIT; +} + +static int ll_statahead_thread(void *arg) +{ + struct dentry *parent = (struct dentry *)arg; + struct inode *dir = parent->d_inode; + struct ll_inode_info *plli = ll_i2info(dir); + struct ll_inode_info *clli; + struct ll_sb_info *sbi = ll_i2sbi(dir); + struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; + struct page *page; + __u64 pos = 0; + int first = 0; + int rc = 0; + struct ll_dir_chain chain; + struct l_wait_info lwi = { 0 }; + ENTRY; + + CDEBUG(D_READA, "statahead thread started: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + if (sbi->ll_flags & LL_SBI_AGL_ENABLED) + ll_start_agl(parent, sai); + + atomic_inc(&sbi->ll_sa_total); + spin_lock(&plli->lli_sa_lock); + thread_set_flags(thread, SVC_RUNNING); + spin_unlock(&plli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + rc = PTR_ERR(page); + CDEBUG(D_READA, "error reading dir "DFID" at "LPU64 + "/"LPU64": [rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, sai->sai_index, + rc, plli->lli_opendir_pid); + GOTO(out, rc); + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + if (unlikely(hash < pos)) + /* + * Skip until we find target hash value. + */ + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * Skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) { + /* + * skip "." + */ + continue; + } else if (name[1] == '.' && namelen == 2) { + /* + * skip ".." + */ + continue; + } else if (!sai->sai_ls_all) { + /* + * skip hidden files. + */ + sai->sai_skip_hidden++; + continue; + } + } + + /* + * don't stat-ahead first entry. + */ + if (unlikely(++first == 1)) + continue; + +keep_it: + l_wait_event(thread->t_ctl_waitq, + !sa_sent_full(sai) || + !sa_received_empty(sai) || + !agl_list_empty(sai) || + !thread_is_running(thread), + &lwi); + +interpret_it: + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) { + ll_release_page(page, 0); + GOTO(out, rc = 0); + } + + /* If no window for metadata statahead, but there are + * some AGL entries to be triggered, then try to help + * to process the AGL entries. */ + if (sa_sent_full(sai)) { + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, + sai); + + if (!sa_received_empty(sai)) + goto interpret_it; + + if (unlikely( + !thread_is_running(thread))) { + ll_release_page(page, 0); + GOTO(out, rc = 0); + } + + if (!sa_sent_full(sai)) + goto do_it; + + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + goto keep_it; + } + +do_it: + ll_statahead_one(parent, name, namelen); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(page, 0); + while (1) { + l_wait_event(thread->t_ctl_waitq, + !sa_received_empty(sai) || + sai->sai_sent == sai->sai_replied|| + !thread_is_running(thread), + &lwi); + + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + if (unlikely(!thread_is_running(thread))) + GOTO(out, rc = 0); + + if (sai->sai_sent == sai->sai_replied && + sa_received_empty(sai)) + break; + } + + spin_lock(&plli->lli_agl_lock); + while (!agl_list_empty(sai) && + thread_is_running(thread)) { + clli = agl_first_entry(sai); + list_del_init(&clli->lli_agl_list); + spin_unlock(&plli->lli_agl_lock); + ll_agl_trigger(&clli->lli_vfs_inode, sai); + spin_lock(&plli->lli_agl_lock); + } + spin_unlock(&plli->lli_agl_lock); + + GOTO(out, rc = 0); + } else if (1) { + /* + * chain is exhausted. + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + sai->sai_in_readpage = 1; + page = ll_get_dir_page(dir, pos, &chain); + sai->sai_in_readpage = 0; + } else { + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + /* + * go into overflow page. + */ + } + } + EXIT; + +out: + if (sai->sai_agl_valid) { + spin_lock(&plli->lli_agl_lock); + thread_set_flags(agl_thread, SVC_STOPPING); + spin_unlock(&plli->lli_agl_lock); + wake_up(&agl_thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop agl thread: [pid %d]\n", + current_pid()); + l_wait_event(agl_thread->t_ctl_waitq, + thread_is_stopped(agl_thread), + &lwi); + } else { + /* Set agl_thread flags anyway. */ + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + } + ll_dir_chain_fini(&chain); + spin_lock(&plli->lli_sa_lock); + if (!sa_received_empty(sai)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&plli->lli_sa_lock); + + /* To release the resources held by received entries. */ + while (!sa_received_empty(sai)) + ll_post_statahead(sai); + + spin_lock(&plli->lli_sa_lock); + } + thread_set_flags(thread, SVC_STOPPED); + spin_unlock(&plli->lli_sa_lock); + wake_up(&sai->sai_waitq); + wake_up(&thread->t_ctl_waitq); + ll_sai_put(sai); + dput(parent); + CDEBUG(D_READA, "statahead thread stopped: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + return rc; +} + +/** + * called in ll_file_release(). + */ +void ll_stop_statahead(struct inode *dir, void *key) +{ + struct ll_inode_info *lli = ll_i2info(dir); + + if (unlikely(key == NULL)) + return; + + spin_lock(&lli->lli_sa_lock); + if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { + spin_unlock(&lli->lli_sa_lock); + return; + } + + lli->lli_opendir_key = NULL; + + if (lli->lli_sai) { + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; + + if (!thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_READA, "stop statahead thread: [pid %d]\n", + current_pid()); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + } else { + spin_unlock(&lli->lli_sa_lock); + } + + /* + * Put the ref which was held when first statahead_enter. + * It maybe not the last ref for some statahead requests + * maybe inflight. + */ + ll_sai_put(lli->lli_sai); + } else { + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + } +} + +enum { + /** + * not first dirent, or is "." + */ + LS_NONE_FIRST_DE = 0, + /** + * the first non-hidden dirent + */ + LS_FIRST_DE, + /** + * the first hidden dirent, that is "." + */ + LS_FIRST_DOT_DE +}; + +static int is_first_dirent(struct inode *dir, struct dentry *dentry) +{ + struct ll_dir_chain chain; + struct qstr *target = &dentry->d_name; + struct page *page; + __u64 pos = 0; + int dot_de; + int rc = LS_NONE_FIRST_DE; + ENTRY; + + ll_dir_chain_init(&chain); + page = ll_get_dir_page(dir, pos, &chain); + + while (1) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (IS_ERR(page)) { + struct ll_inode_info *lli = ll_i2info(dir); + + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at "LPU64": " + "[rc %d] [parent %u]\n", + PFID(ll_inode2fid(dir)), pos, + rc, lli->lli_opendir_pid); + break; + } + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL; + ent = lu_dirent_next(ent)) { + __u64 hash; + int namelen; + char *name; + + hash = le64_to_cpu(ent->lde_hash); + /* The ll_get_dir_page() can return any page containing + * the given hash which may be not the start hash. */ + if (unlikely(hash < pos)) + continue; + + namelen = le16_to_cpu(ent->lde_namelen); + if (unlikely(namelen == 0)) + /* + * skip dummy record. + */ + continue; + + name = ent->lde_name; + if (name[0] == '.') { + if (namelen == 1) + /* + * skip "." + */ + continue; + else if (name[1] == '.' && namelen == 2) + /* + * skip ".." + */ + continue; + else + dot_de = 1; + } else { + dot_de = 0; + } + + if (dot_de && target->name[0] != '.') { + CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", + target->len, target->name, + namelen, name); + continue; + } + + if (target->len != namelen || + memcmp(target->name, name, namelen) != 0) + rc = LS_NONE_FIRST_DE; + else if (!dot_de) + rc = LS_FIRST_DE; + else + rc = LS_FIRST_DOT_DE; + + ll_release_page(page, 0); + GOTO(out, rc); + } + pos = le64_to_cpu(dp->ldp_hash_end); + if (pos == MDS_DIR_END_OFF) { + /* + * End of directory reached. + */ + ll_release_page(page, 0); + break; + } else if (1) { + /* + * chain is exhausted + * Normal case: continue to the next page. + */ + ll_release_page(page, le32_to_cpu(dp->ldp_flags) & + LDF_COLLIDE); + page = ll_get_dir_page(dir, pos, &chain); + } else { + /* + * go into overflow page. + */ + LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); + ll_release_page(page, 1); + } + } + EXIT; + +out: + ll_dir_chain_fini(&chain); + return rc; +} + +static void +ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) +{ + struct ptlrpc_thread *thread = &sai->sai_thread; + struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); + int hit; + ENTRY; + + if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) + hit = 1; + else + hit = 0; + + ll_sa_entry_fini(sai, entry); + if (hit) { + sai->sai_hit++; + sai->sai_consecutive_miss = 0; + sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); + } else { + struct ll_inode_info *lli = ll_i2info(sai->sai_inode); + + sai->sai_miss++; + sai->sai_consecutive_miss++; + if (sa_low_hit(sai) && thread_is_running(thread)) { + atomic_inc(&sbi->ll_sa_wrong); + CDEBUG(D_READA, "Statahead for dir "DFID" hit " + "ratio too low: hit/miss "LPU64"/"LPU64 + ", sent/replied "LPU64"/"LPU64", stopping " + "statahead thread: pid %d\n", + PFID(&lli->lli_fid), sai->sai_hit, + sai->sai_miss, sai->sai_sent, + sai->sai_replied, current_pid()); + spin_lock(&lli->lli_sa_lock); + if (!thread_is_stopped(thread)) + thread_set_flags(thread, SVC_STOPPING); + spin_unlock(&lli->lli_sa_lock); + } + } + + if (!thread_is_stopped(thread)) + wake_up(&thread->t_ctl_waitq); + + EXIT; +} + +/** + * Start statahead thread if this is the first dir entry. + * Otherwise if a thread is started already, wait it until it is ahead of me. + * \retval 1 -- find entry with lock in cache, the caller needs to do + * nothing. + * \retval 0 -- find entry in cache, but without lock, the caller needs + * refresh from MDS. + * \retval others -- the caller need to process as non-statahead. + */ +int do_statahead_enter(struct inode *dir, struct dentry **dentryp, + int only_unplug) +{ + struct ll_inode_info *lli = ll_i2info(dir); + struct ll_statahead_info *sai = lli->lli_sai; + struct dentry *parent; + struct ll_sa_entry *entry; + struct ptlrpc_thread *thread; + struct l_wait_info lwi = { 0 }; + int rc = 0; + struct ll_inode_info *plli; + ENTRY; + + LASSERT(lli->lli_opendir_pid == current_pid()); + + if (sai) { + thread = &sai->sai_thread; + if (unlikely(thread_is_stopped(thread) && + list_empty(&sai->sai_entries_stated))) { + /* to release resource */ + ll_stop_statahead(dir, lli->lli_opendir_key); + RETURN(-EAGAIN); + } + + if ((*dentryp)->d_name.name[0] == '.') { + if (sai->sai_ls_all || + sai->sai_miss_hidden >= sai->sai_skip_hidden) { + /* + * Hidden dentry is the first one, or statahead + * thread does not skip so many hidden dentries + * before "sai_ls_all" enabled as below. + */ + } else { + if (!sai->sai_ls_all) + /* + * It maybe because hidden dentry is not + * the first one, "sai_ls_all" was not + * set, then "ls -al" missed. Enable + * "sai_ls_all" for such case. + */ + sai->sai_ls_all = 1; + + /* + * Such "getattr" has been skipped before + * "sai_ls_all" enabled as above. + */ + sai->sai_miss_hidden++; + RETURN(-EAGAIN); + } + } + + entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); + if (entry == NULL || only_unplug) { + ll_sai_unplug(sai, entry); + RETURN(entry ? 1 : -EAGAIN); + } + + /* if statahead is busy in readdir, help it do post-work */ + while (!ll_sa_entry_stated(entry) && + sai->sai_in_readpage && + !sa_received_empty(sai)) + ll_post_statahead(sai); + + if (!ll_sa_entry_stated(entry)) { + sai->sai_index_wait = entry->se_index; + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(sai->sai_waitq, + ll_sa_entry_stated(entry) || + thread_is_stopped(thread), + &lwi); + if (rc < 0) { + ll_sai_unplug(sai, entry); + RETURN(-EAGAIN); + } + } + + if (entry->se_stat == SA_ENTRY_SUCC && + entry->se_inode != NULL) { + struct inode *inode = entry->se_inode; + struct lookup_intent it = { .it_op = IT_GETATTR, + .d.lustre.it_lock_handle = + entry->se_handle }; + __u64 bits; + + rc = md_revalidate_lock(ll_i2mdexp(dir), &it, + ll_inode2fid(inode), &bits); + if (rc == 1) { + if ((*dentryp)->d_inode == NULL) { + *dentryp = ll_splice_alias(inode, + *dentryp); + } else if ((*dentryp)->d_inode != inode) { + /* revalidate, but inode is recreated */ + CDEBUG(D_READA, + "stale dentry %.*s inode %lu/%u, " + "statahead inode %lu/%u\n", + (*dentryp)->d_name.len, + (*dentryp)->d_name.name, + (*dentryp)->d_inode->i_ino, + (*dentryp)->d_inode->i_generation, + inode->i_ino, + inode->i_generation); + ll_sai_unplug(sai, entry); + RETURN(-ESTALE); + } else { + iput(inode); + } + entry->se_inode = NULL; + + if ((bits & MDS_INODELOCK_LOOKUP) && + d_lustre_invalid(*dentryp)) + d_lustre_revalidate(*dentryp); + ll_intent_release(&it); + } + } + + ll_sai_unplug(sai, entry); + RETURN(rc); + } + + /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ + rc = is_first_dirent(dir, *dentryp); + if (rc == LS_NONE_FIRST_DE) + /* It is not "ls -{a}l" operation, no need statahead for it. */ + GOTO(out, rc = -EAGAIN); + + sai = ll_sai_alloc(); + if (sai == NULL) + GOTO(out, rc = -ENOMEM); + + sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); + sai->sai_inode = igrab(dir); + if (unlikely(sai->sai_inode == NULL)) { + CWARN("Do not start stat ahead on dying inode "DFID"\n", + PFID(&lli->lli_fid)); + GOTO(out, rc = -ESTALE); + } + + /* get parent reference count here, and put it in ll_statahead_thread */ + parent = dget((*dentryp)->d_parent); + if (unlikely(sai->sai_inode != parent->d_inode)) { + struct ll_inode_info *nlli = ll_i2info(parent->d_inode); + + CWARN("Race condition, someone changed %.*s just now: " + "old parent "DFID", new parent "DFID"\n", + (*dentryp)->d_name.len, (*dentryp)->d_name.name, + PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); + dput(parent); + iput(sai->sai_inode); + GOTO(out, rc = -EAGAIN); + } + + CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %.*s]\n", + current_pid(), parent->d_name.len, parent->d_name.name); + + lli->lli_sai = sai; + + plli = ll_i2info(parent->d_inode); + rc = PTR_ERR(kthread_run(ll_statahead_thread, parent, + "ll_sa_%u", plli->lli_opendir_pid)); + thread = &sai->sai_thread; + if (IS_ERR_VALUE(rc)) { + CERROR("can't start ll_sa thread, rc: %d\n", rc); + dput(parent); + lli->lli_opendir_key = NULL; + thread_set_flags(thread, SVC_STOPPED); + thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); + ll_sai_put(sai); + LASSERT(lli->lli_sai == NULL); + RETURN(-EAGAIN); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + /* + * We don't stat-ahead for the first dirent since we are already in + * lookup. + */ + RETURN(-EAGAIN); + +out: + if (sai != NULL) + OBD_FREE_PTR(sai); + spin_lock(&lli->lli_sa_lock); + lli->lli_opendir_key = NULL; + lli->lli_opendir_pid = 0; + spin_unlock(&lli->lli_sa_lock); + return rc; +} diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c new file mode 100644 index 000000000000..4101c52ed5d7 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/super25.c @@ -0,0 +1,226 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "llite_internal.h" + +static struct kmem_cache *ll_inode_cachep; + +static struct inode *ll_alloc_inode(struct super_block *sb) +{ + struct ll_inode_info *lli; + ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); + OBD_SLAB_ALLOC_PTR_GFP(lli, ll_inode_cachep, __GFP_IO); + if (lli == NULL) + return NULL; + + inode_init_once(&lli->lli_vfs_inode); + return &lli->lli_vfs_inode; +} + +static void ll_inode_destroy_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ll_inode_info *ptr = ll_i2info(inode); + OBD_SLAB_FREE_PTR(ptr, ll_inode_cachep); +} + +static void ll_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ll_inode_destroy_callback); +} + +int ll_init_inodecache(void) +{ + ll_inode_cachep = kmem_cache_create("lustre_inode_cache", + sizeof(struct ll_inode_info), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (ll_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ll_destroy_inodecache(void) +{ + kmem_cache_destroy(ll_inode_cachep); +} + +/* exported operations */ +struct super_operations lustre_super_operations = +{ + .alloc_inode = ll_alloc_inode, + .destroy_inode = ll_destroy_inode, + .evict_inode = ll_delete_inode, + .put_super = ll_put_super, + .statfs = ll_statfs, + .umount_begin = ll_umount_begin, + .remount_fs = ll_remount_fs, + .show_options = ll_show_options, +}; + + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); + +int vvp_global_init(void); +void vvp_global_fini(void); + +static int __init init_lustre_lite(void) +{ + int i, rc, seed[2]; + struct timeval tv; + lnet_process_id_t lnet_id; + + CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1); + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre client module (%p).\n", + &lustre_super_operations); + + rc = ll_init_inodecache(); + if (rc) + return -ENOMEM; + ll_file_data_slab = kmem_cache_create("ll_file_data", + sizeof(struct ll_file_data), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (ll_file_data_slab == NULL) { + ll_destroy_inodecache(); + return -ENOMEM; + } + + ll_remote_perm_cachep = kmem_cache_create("ll_remote_perm_cache", + sizeof(struct ll_remote_perm), + 0, 0, NULL); + if (ll_remote_perm_cachep == NULL) { + kmem_cache_destroy(ll_file_data_slab); + ll_file_data_slab = NULL; + ll_destroy_inodecache(); + return -ENOMEM; + } + + ll_rmtperm_hash_cachep = kmem_cache_create("ll_rmtperm_hash_cache", + REMOTE_PERM_HASHSIZE * + sizeof(struct list_head), + 0, 0, NULL); + if (ll_rmtperm_hash_cachep == NULL) { + kmem_cache_destroy(ll_remote_perm_cachep); + ll_remote_perm_cachep = NULL; + kmem_cache_destroy(ll_file_data_slab); + ll_file_data_slab = NULL; + ll_destroy_inodecache(); + return -ENOMEM; + } + + proc_lustre_fs_root = proc_lustre_root ? + lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL; + + lustre_register_client_fill_super(ll_fill_super); + lustre_register_kill_super_cb(ll_kill_super); + + lustre_register_client_process_config(ll_process_config); + + cfs_get_random_bytes(seed, sizeof(seed)); + + /* Nodes with small feet have little entropy + * the NID for this node gives the most entropy in the low bits */ + for (i=0; ; i++) { + if (LNetGetId(i, &lnet_id) == -ENOENT) { + break; + } + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) { + seed[0] ^= LNET_NIDADDR(lnet_id.nid); + } + } + + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + init_timer(&ll_capa_timer); + ll_capa_timer.function = ll_capa_timer_callback; + rc = ll_capa_thread_start(); + /* + * XXX normal cleanup is needed here. + */ + if (rc == 0) + rc = vvp_global_init(); + + return rc; +} + +static void __exit exit_lustre_lite(void) +{ + vvp_global_fini(); + del_timer(&ll_capa_timer); + ll_capa_thread_stop(); + LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0, + "client remaining capa count %d\n", + capa_count[CAPA_SITE_CLIENT]); + + lustre_register_client_fill_super(NULL); + lustre_register_kill_super_cb(NULL); + + lustre_register_client_process_config(NULL); + + ll_destroy_inodecache(); + + kmem_cache_destroy(ll_rmtperm_hash_cachep); + ll_rmtperm_hash_cachep = NULL; + + kmem_cache_destroy(ll_remote_perm_cachep); + ll_remote_perm_cachep = NULL; + + kmem_cache_destroy(ll_file_data_slab); + if (proc_lustre_fs_root) + lprocfs_remove(&proc_lustre_fs_root); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Lite Client File System"); +MODULE_LICENSE("GPL"); + +module_init(init_lustre_lite); +module_exit(exit_lustre_lite); diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c new file mode 100644 index 000000000000..5260e989a4e5 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -0,0 +1,192 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include "llite_internal.h" + +static int ll_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, char **symname) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc, symlen = i_size_read(inode) + 1; + struct mdt_body *body; + struct md_op_data *op_data; + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + int print_limit = min_t(int, PAGE_SIZE - 128, symlen); + + *symname = lli->lli_symlink_name; + /* If the total CDEBUG() size is larger than a page, it + * will print a warning to the console, avoid this by + * printing just the last part of the symlink. */ + CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", + print_limit < symlen ? "..." : "", print_limit, + (*symname) + symlen - print_limit, symlen); + RETURN(0); + } + + op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, + LUSTRE_OPC_ANY, NULL); + if (IS_ERR(op_data)) + RETURN(PTR_ERR(op_data)); + + op_data->op_valid = OBD_MD_LINKNAME; + rc = md_getattr(sbi->ll_md_exp, op_data, request); + ll_finish_md_op_data(op_data); + if (rc) { + if (rc != -ENOENT) + CERROR("inode %lu: rc = %d\n", inode->i_ino, rc); + GOTO (failed, rc); + } + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + if ((body->valid & OBD_MD_LINKNAME) == 0) { + CERROR("OBD_MD_LINKNAME not set on reply\n"); + GOTO(failed, rc = -EPROTO); + } + + LASSERT(symlen != 0); + if (body->eadatasize != symlen) { + CERROR("inode %lu: symlink length %d not expected %d\n", + inode->i_ino, body->eadatasize - 1, symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); + if (*symname == NULL || + strnlen(*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR("inode %lu: symlink not NULL terminated string" + "of length %d\n", inode->i_ino, symlen - 1); + GOTO(failed, rc = -EPROTO); + } + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) { + memcpy(lli->lli_symlink_name, *symname, symlen); + *symname = lli->lli_symlink_name; + } + RETURN(0); + +failed: + RETURN (rc); +} + +static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request; + char *symname; + int rc; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + if (rc) + GOTO(out, rc); + + rc = vfs_readlink(dentry, buffer, buflen, symname); + out: + ptlrpc_req_finished(request); + ll_inode_size_unlock(inode); + RETURN(rc); +} + +static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct ptlrpc_request *request = NULL; + int rc; + char *symname; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op\n"); + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. + * For 8k stacks we need to limit it to 7 for local servers. */ + if (THREAD_SIZE < 8192 && current->link_count >= 6) { + rc = -ELOOP; + } else if (THREAD_SIZE == 8192 && current->link_count >= 8) { + rc = -ELOOP; + } else { + ll_inode_size_lock(inode); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode); + } + if (rc) { + ptlrpc_req_finished(request); + request = NULL; + symname = ERR_PTR(rc); + } + + nd_set_link(nd, symname); + /* symname may contain a pointer to the request message buffer, + * we delay request releasing until ll_put_link then. + */ + RETURN(request); +} + +static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + ptlrpc_req_finished(cookie); +} + +struct inode_operations ll_fast_symlink_inode_operations = { + .readlink = ll_readlink, + .setattr = ll_setattr, + .follow_link = ll_follow_link, + .put_link = ll_put_link, + .getattr = ll_getattr, + .permission = ll_inode_permission, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, +}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c new file mode 100644 index 000000000000..60daf750e2e1 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c @@ -0,0 +1,547 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. + */ + +struct kmem_cache *vvp_thread_kmem; +static struct kmem_cache *vvp_session_kmem; +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof (struct vvp_thread_info), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = NULL + } +}; + +static void *vvp_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, vvp_thread_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void vvp_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, vvp_thread_kmem); +} + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, vvp_session_kmem, __GFP_IO); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + + +struct lu_context_key vvp_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_key_init, + .lct_fini = vvp_key_fini +}; + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static const struct cl_device_operations vvp_cl_ops = { + .cdo_req_init = ccc_req_init +}; + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = ccc_device_free, + .ldto_device_init = ccc_device_init, + .ldto_device_fini = ccc_device_fini +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * A mutex serializing calls to vvp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +int vvp_global_init(void) +{ + int result; + + result = lu_kmem_init(vvp_caches); + if (result == 0) { + result = ccc_global_init(&vvp_device_type); + if (result != 0) + lu_kmem_fini(vvp_caches); + } + return result; +} + +void vvp_global_fini(void) +{ + ccc_global_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + + +/***************************************************************************** + * + * mirror obd-devices into cl devices. + * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + int refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + cl2ccc_dev(cl)->cdv_sb = sb; + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + RETURN(rc); +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + int refcheck; + int result; + + ENTRY; + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + /* + * If mount failed (sbi->ll_cl == NULL), and this there are no other + * mounts, stop device types manually (this usually happens + * automatically when last device is destroyed). + */ + lu_types_stop(); + RETURN(result); +} + +/**************************************************************************** + * + * /proc/fs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +/* + * To represent contents of a page cache as a byte stream, following + * information if encoded in 64bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * First two data identify a file in the cache uniquely. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; + + unsigned vpi_curdep; + struct lu_object_header *vpi_obj; +}; + +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} + +static int vvp_pgcache_obj_get(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + struct vvp_pgcache_id *id = data; + struct lu_object_header *hdr = cfs_hash_object(hs, hnode); + + if (id->vpi_curdep-- > 0) + return 0; /* continue */ + + if (lu_object_is_dying(hdr)) + return 1; + + cfs_hash_get(hs, hnode); + id->vpi_obj = hdr; + return 1; +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + LASSERT(lu_device_is_cl(dev)); + + id->vpi_depth &= 0xf; + id->vpi_obj = NULL; + id->vpi_curdep = id->vpi_depth; + + cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, + vvp_pgcache_obj_get, id); + if (id->vpi_obj != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", current); + return lu2cl(lu_obj); + } + lu_object_put(env, lu_object_top(id->vpi_obj)); + + } else if (id->vpi_curdep > 0) { + id->vpi_depth = 0xf; + } + return NULL; +} + +static loff_t vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) +{ + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; + + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); + + while (1) { + if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash)) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct cl_object_header *hdr; + int nr; + struct cl_page *pg; + + /* got an object. Find next page. */ + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + nr = radix_tree_gang_lookup(&hdr->coh_tree, + (void **)&pg, + id.vpi_index, 1); + if (nr > 0) { + id.vpi_index = pg->cp_index; + /* Cant support over 16T file */ + nr = !(pg->cp_index > 0xffffffff); + } + spin_unlock(&hdr->coh_page_guard); + + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); + } + /* to the next object. */ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while(0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct ccc_page *cpg; + struct page *vmpage; + int has_flags; + + cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vmpage = cpg->cpg_page; + seq_printf(seq," %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [", + 0 /* gen */, + cpg, page, + "none", + cpg->cpg_write_queued ? "wq" : "- ", + cpg->cpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, vmpage->mapping->host->i_ino, + vmpage->mapping->host->i_generation, + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ? "" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct cl_page *page; + struct cl_object_header *hdr; + struct vvp_pgcache_id id; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + page = cl_page_lookup(hdr, id.vpi_index); + spin_unlock(&hdr->coh_page_guard); + + seq_printf(f, "%8x@"DFID": ", + id.vpi_index, PFID(&hdr->coh_lu.loh_fid)); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", current); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + sbi = f->private; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_obj_hash->hs_cur_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct proc_dir_entry *dp = PDE(inode); + struct ll_sb_info *sbi = dp->data; + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; + } + return result; +} + +struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h new file mode 100644 index 000000000000..c82bf17f55a6 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + + +#include +#include "llite_internal.h" + +int vvp_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int vvp_lock_init (const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int vvp_page_init (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct ccc_object *cl_inode2ccc(struct inode *inode); + +extern struct kmem_cache *vvp_thread_kmem; + +#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c new file mode 100644 index 000000000000..8504d448aac8 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -0,0 +1,1175 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include + +#include "vvp_internal.h" + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice); + +/** + * True, if \a io is a normal io, False for sendfile() / splice_{read|write} + */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + return vio->cui_io_subtype == IO_NORMAL; +} + +/** + * For swapping layout. The file's layout may have changed. + * To avoid populating pages to a wrong stripe, we have to verify the + * correctness of layout. It works because swapping layout processes + * have to acquire group lock. + */ +static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, + struct inode *inode) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct ccc_io *cio = ccc_env_io(env); + bool rc = true; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* don't need lock here to check lli_layout_gen as we have held + * extent lock and GROUP lock has to hold to swap layout */ + if (lli->lli_layout_gen != cio->cui_layout_gen) { + io->ci_need_restart = 1; + /* this will return application a short read/write */ + io->ci_continue = 0; + rc = false; + } + case CIT_FAULT: + /* fault is okay because we've already had a page. */ + default: + break; + } + + return rc; +} + +/***************************************************************************** + * + * io operations. + * + */ + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = ccc_object_inode(ios->cis_obj); + + LASSERT(inode == + cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode); + vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime); + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct ccc_io *cio = cl2ccc_io(env, ios); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "ignore/verify layout %d/%d, layout version %d.\n", + io->ci_ignore_layout, io->ci_verify_layout, cio->cui_layout_gen); + + if (!io->ci_ignore_layout && io->ci_verify_layout) { + __u32 gen = 0; + + /* check layout version */ + ll_layout_refresh(ccc_object_inode(obj), &gen); + io->ci_need_restart = cio->cui_layout_gen != gen; + if (io->ci_need_restart) + CDEBUG(D_VFSTRACE, "layout changed from %d to %d.\n", + cio->cui_layout_gen, gen); + } +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct ccc_io *vio, struct cl_io *io) +{ + struct ccc_thread_info *cti = ccc_env_info(env); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &cti->cti_descr; + ldlm_policy_data_t policy; + unsigned long addr; + unsigned long seg; + ssize_t count; + int result; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!cl_is_normalio(env, io)) + RETURN(0); + + if (vio->cui_iov == NULL) /* nfs or loop back device write */ + RETURN(0); + + /* No MM (e.g. NFS)? No vmas too. */ + if (mm == NULL) + RETURN(0); + + for (seg = 0; seg < vio->cui_nrsegs; seg++) { + const struct iovec *iv = &vio->cui_iov[seg]; + + addr = (unsigned long)iv->iov_base; + count = iv->iov_len; + if (count == 0) + continue; + + count += addr & (~CFS_PAGE_MASK); + addr &= CFS_PAGE_MASK; + + down_read(&mm->mmap_sem); + while((vma = our_vma(mm, addr, count)) != NULL) { + struct inode *inode = vma->vm_file->f_dentry->d_inode; + int flags = CEF_MUST; + + if (ll_file_nolock(vma->vm_file)) { + /* + * For no lock case, a lockless lock will be + * generated. + */ + flags = CEF_NEVER; + } + + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. + */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + descr->cld_enq_flags = flags; + result = cl_io_lock_alloc_add(env, io, descr); + + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + descr->cld_mode, descr->cld_start, + descr->cld_end); + + if (result < 0) + RETURN(result); + + if (vma->vm_end - addr >= count) + break; + + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + up_read(&mm->mmap_sem); + } + RETURN(0); +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + ccc_io_update_iov(env, cio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + result = vvp_mmap_locks(env, cio, io); + if (result == 0) + result = ccc_io_one_lock(env, io, ast_flags, mode, start, end); + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj)); + int result; + + ENTRY; + /* XXX: Layer violation, we shouldn't see lsm at llite level. */ + if (lli->lli_has_smd) /* lsm-less file doesn't need to lock */ + result = vvp_io_rw_lock(env, io, CLM_READ, + io->u.ci_rd.rd.crw_pos, + io->u.ci_rd.rd.crw_pos + + io->u.ci_rd.rd.crw_count - 1); + else + result = 0; + RETURN(result); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return ccc_io_one_lock_index + (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + return vvp_io_rw_lock(env, io, CLM_WRITE, start, end); +} + +static int vvp_io_setattr_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +/** + * Implementation of cl_io_operations::cio_lock() method for CIT_SETATTR io. + * + * Handles "lockless io" mode when extent locking is done by server. + */ +static int vvp_io_setattr_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = ccc_env_io(env); + struct cl_io *io = ios->cis_io; + __u64 new_size; + __u32 enqflags = 0; + + if (cl_io_is_trunc(io)) { + new_size = io->u.ci_setattr.sa_attr.lvb_size; + if (new_size == 0) + enqflags = CEF_DISCARD_DATA; + } else { + if ((io->u.ci_setattr.sa_attr.lvb_mtime >= + io->u.ci_setattr.sa_attr.lvb_ctime) || + (io->u.ci_setattr.sa_attr.lvb_atime >= + io->u.ci_setattr.sa_attr.lvb_ctime)) + return 0; + new_size = 0; + } + cio->u.setattr.cui_local_lock = SETATTR_EXTENT_LOCK; + return ccc_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); +} + +static int vvp_do_vmtruncate(struct inode *inode, size_t size) +{ + int result; + /* + * Only ll_inode_size_lock is taken at this level. + */ + ll_inode_size_lock(inode); + result = inode_newsize_ok(inode, size); + if (result < 0) { + ll_inode_size_unlock(inode); + return result; + } + truncate_setsize(inode, size); + ll_inode_size_unlock(inode); + return result; +} + +static int vvp_io_setattr_trunc(const struct lu_env *env, + const struct cl_io_slice *ios, + struct inode *inode, loff_t size) +{ + inode_dio_wait(inode); + return 0; +} + +static int vvp_io_setattr_time(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct cl_attr *attr = ccc_env_thread_attr(env); + int result; + unsigned valid = CAT_CTIME; + + cl_object_attr_lock(obj); + attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; + if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { + attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; + valid |= CAT_ATIME; + } + if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { + attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; + valid |= CAT_MTIME; + } + result = cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + + return result; +} + +static int vvp_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + + mutex_lock(&inode->i_mutex); + if (cl_io_is_trunc(io)) + return vvp_io_setattr_trunc(env, ios, inode, + io->u.ci_setattr.sa_attr.lvb_size); + else + return vvp_io_setattr_time(env, ios); +} + +static void vvp_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + + if (cl_io_is_trunc(io)) { + /* Truncate in memory pages - they must be clean pages + * because osc has already notified to destroy osc_extents. */ + vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); + inode_dio_write_done(inode); + } + mutex_unlock(&inode->i_mutex); +} + +static void vvp_io_setattr_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + vvp_io_fini(env, ios); +} + +static ssize_t lustre_generic_file_read(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_aio_read(vio->cui_iocb, vio->cui_iov, + vio->cui_nrsegs, *ppos); +} + +static ssize_t lustre_generic_file_write(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_aio_write(vio->cui_iocb, vio->cui_iov, + vio->cui_nrsegs, *ppos); +} + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct ll_ra_read *bead = &vio->cui_bead; + struct file *file = cio->cui_fd->fd_file; + + int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + long cnt = io->u.ci_rd.rd.crw_count; + long tot = cio->cui_tot_count; + int exceed = 0; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); + + if (!can_populate_pages(env, io, inode)) + return 0; + + result = ccc_prep_size(env, obj, io, pos, tot, &exceed); + if (result != 0) + return result; + else if (exceed != 0) + goto out; + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, %lu bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + cio->cui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->cui_ra_window_set) { + vio->cui_ra_window_set = 1; + bead->lrr_start = cl_index(obj, pos); + /* + * XXX: explicit PAGE_CACHE_SIZE + */ + bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1); + ll_ra_read_in(file, bead); + } + + /* BUG: 5972 */ + file_accessed(file); + switch (vio->cui_io_subtype) { + case IO_NORMAL: + result = lustre_generic_file_read(file, cio, &pos); + break; + case IO_SPLICE: + result = generic_file_splice_read(file, &pos, + vio->u.splice.cui_pipe, cnt, + vio->u.splice.cui_flags); + /* LU-1109: do splice read stripe by stripe otherwise if it + * may make nfsd stuck if this read occupied all internal pipe + * buffers. */ + io->ci_continue = 0; + break; + default: + CERROR("Wrong IO type %u\n", vio->cui_io_subtype); + LBUG(); + } + +out: + if (result >= 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, 0); + result = 0; + } + return result; +} + +static void vvp_io_read_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + + if (vio->cui_ra_window_set) + ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead); + + vvp_io_fini(env, ios); +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct file *file = cio->cui_fd->fd_file; + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + + ENTRY; + + if (!can_populate_pages(env, io, inode)) + return 0; + + if (cl_io_is_append(io)) { + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. + */ + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + cio->cui_iocb->ki_pos = pos; + } + + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt); + + if (cio->cui_iov == NULL) /* from a temp io in ll_cl_init(). */ + result = 0; + else + result = lustre_generic_file_write(file, cio, &pos); + + if (result > 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, 0); + result = 0; + } + RETURN(result); +} + +static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) +{ + struct vm_fault *vmf = cfio->fault.ft_vmf; + + cfio->fault.ft_flags = filemap_fault(cfio->ft_vma, vmf); + + if (vmf->page) { + LL_CDEBUG_PAGE(D_PAGE, vmf->page, "got addr %p type NOPAGE\n", + vmf->virtual_address); + if (unlikely(!(cfio->fault.ft_flags & VM_FAULT_LOCKED))) { + lock_page(vmf->page); + cfio->fault.ft_flags &= VM_FAULT_LOCKED; + } + + cfio->ft_vmpage = vmf->page; + return 0; + } + + if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) { + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + return -EFAULT; + } + + if (cfio->fault.ft_flags & VM_FAULT_OOM) { + CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + return -ENOMEM; + } + + if (cfio->fault.ft_flags & VM_FAULT_RETRY) + return -EAGAIN; + + CERROR("unknow error in page fault %d!\n", cfio->fault.ft_flags); + return -EINVAL; +} + + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + loff_t offset; + int result = 0; + struct page *vmpage = NULL; + struct cl_page *page; + loff_t size; + pgoff_t last; /* last page in a file data region */ + + if (fio->ft_executable && + LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime) + CWARN("binary "DFID + " changed while waiting for the page fault lock\n", + PFID(lu_object_fid(&obj->co_lu))); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = ccc_prep_size(env, obj, io, 0, offset + 1, NULL); + if (result != 0) + return result; + + /* must return locked page */ + if (fio->ft_mkwrite) { + LASSERT(cfio->ft_vmpage != NULL); + lock_page(cfio->ft_vmpage); + } else { + result = vvp_io_kernel_fault(cfio); + if (result != 0) + return result; + } + + vmpage = cfio->ft_vmpage; + LASSERT(PageLocked(vmpage)); + + if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) + ll_invalidate_page(vmpage); + + size = i_size_read(inode); + /* Though we have already held a cl_lock upon this page, but + * it still can be truncated locally. */ + if (unlikely((vmpage->mapping != inode->i_mapping) || + (page_offset(vmpage) > size))) { + CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); + + /* return +1 to stop cl_io_loop() and ll_fault() will catch + * and retry. */ + GOTO(out, result = +1); + } + + + if (fio->ft_mkwrite ) { + pgoff_t last_index; + /* + * Capture the size while holding the lli_trunc_sem from above + * we want to make sure that we complete the mkwrite action + * while holding this lock. We need to make sure that we are + * not past the end of the file. + */ + last_index = cl_index(obj, size - 1); + if (last_index < fio->ft_index) { + CDEBUG(D_PAGE, + "llite: mkwrite and truncate race happened: " + "%p: 0x%lx 0x%lx\n", + vmpage->mapping,fio->ft_index,last_index); + /* + * We need to return if we are + * passed the end of the file. This will propagate + * up the call stack to ll_page_mkwrite where + * we will return VM_FAULT_NOPAGE. Any non-negative + * value returned here will be silently + * converted to 0. If the vmpage->mapping is null + * the error code would be converted back to ENODATA + * in ll_page_mkwrite0. Thus we return -ENODATA + * to handle both cases + */ + GOTO(out, result = -ENODATA); + } + } + + page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); + if (IS_ERR(page)) + GOTO(out, result = PTR_ERR(page)); + + /* if page is going to be written, we should add this page into cache + * earlier. */ + if (fio->ft_mkwrite) { + wait_on_page_writeback(vmpage); + if (set_page_dirty(vmpage)) { + struct ccc_page *cp; + + /* vvp_page_assume() calls wait_on_page_writeback(). */ + cl_page_assume(env, io, page); + + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vvp_write_pending(cl2ccc(obj), cp); + + /* Do not set Dirty bit here so that in case IO is + * started before the page is really made dirty, we + * still have chance to detect it. */ + result = cl_page_cache_add(env, io, page, CRT_WRITE); + LASSERT(cl_page_is_owned(page, io)); + + vmpage = NULL; + if (result < 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + + cl_page_put(env, page); + + /* we're in big trouble, what can we do now? */ + if (result == -EDQUOT) + result = -ENOSPC; + GOTO(out, result); + } else + cl_page_disown(env, io, page); + } + } + + last = cl_index(obj, size - 1); + /* + * The ft_index is only used in the case of + * a mkwrite action. We need to check + * our assertions are correct, since + * we should have caught this above + */ + LASSERT(!fio->ft_mkwrite || fio->ft_index <= last); + if (fio->ft_index == last) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + EXIT; + +out: + /* return unlocked vmpage to avoid deadlocking */ + if (vmpage != NULL) + unlock_page(vmpage); + cfio->fault.ft_flags &= ~VM_FAULT_LOCKED; + return result; +} + +static int vvp_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + /* we should mark TOWRITE bit to each dirty page in radix tree to + * verify pages have been written, but this is difficult because of + * race. */ + return 0; +} + +static int vvp_io_read_page(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *page = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = cl2ccc_io(env, ios)->cui_fd; + struct ll_readahead_state *ras = &fd->fd_ras; + struct page *vmpage = cp->cpg_page; + struct cl_2queue *queue = &io->ci_queue; + int rc; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + LASSERT(slice->cpl_obj == obj); + + ENTRY; + + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ras_update(sbi, inode, ras, page->cp_index, + cp->cpg_defer_uptodate); + + /* Sanity check whether the page is protected by a lock. */ + rc = cl_page_is_under_lock(env, io, page); + if (rc != -EBUSY) { + CL_PAGE_HEADER(D_WARNING, env, page, "%s: %d\n", + rc == -ENODATA ? "without a lock" : + "match failed", rc); + if (rc != -ENODATA) + RETURN(rc); + } + + if (cp->cpg_defer_uptodate) { + cp->cpg_ra_used = 1; + cl_page_export(env, page, 1); + } + /* + * Add page into the queue even when it is marked uptodate above. + * this will unlock it automatically as part of cl_page_list_disown(). + */ + cl_2queue_add(queue, page); + if (sbi->ll_ra_info.ra_max_pages_per_file && + sbi->ll_ra_info.ra_max_pages) + ll_readahead(env, io, ras, + vmpage->mapping, &queue->c2_qin, fd->fd_flags); + + RETURN(0); +} + +static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct ccc_page *cp, + enum cl_req_type crt) +{ + struct cl_2queue *queue; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + cl_2queue_init_page(queue, page); + + result = cl_io_submit_sync(env, io, crt, queue, 0); + LASSERT(cl_page_is_owned(page, io)); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} + +/** + * Prepare partially written-to page for a write. + */ +static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj, struct cl_page *pg, + struct ccc_page *cp, + unsigned from, unsigned to) +{ + struct cl_attr *attr = ccc_env_thread_attr(env); + loff_t offset = cl_offset(obj, pg->cp_index); + int result; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result == 0) { + /* + * If are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = ll_kmap_atomic(cp->cpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + ll_kunmap_atomic(kaddr, KM_USER0); + } else if (cp->cpg_defer_uptodate) + cp->cpg_ra_used = 1; + else + result = vvp_page_sync_io(env, io, pg, cp, CRT_READ); + /* + * In older implementations, obdo_refresh_inode is called here + * to update the inode because the write might modify the + * object info at OST. However, this has been proven useless, + * since LVB functions will be called when user space program + * tries to retrieve inode attribute. Also, see bug 15909 for + * details. -jay + */ + if (result == 0) + cl_page_export(env, pg, 1); + } + return result; +} + +static int vvp_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + + int result; + + ENTRY; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == ccc_object_inode(obj)); + + result = 0; + + CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, so _don't_ + * set it up to date until commit_write + */ + if (from == 0 && to == PAGE_CACHE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n"); + POISON_PAGE(page, 0x11); + } else + result = vvp_io_prepare_partial(env, ios->cis_io, obj, + pg, cp, from, to); + } else + CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n"); + RETURN(result); +} + +static int vvp_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct cl_io *io = ios->cis_io; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct page *vmpage = cp->cpg_page; + + int result; + int tallyop; + loff_t size; + + ENTRY; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == inode); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "commiting page write\n"); + CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to); + + /* + * queue a write for some time in the future the first time we + * dirty the page. + * + * This is different from what other file systems do: they usually + * just mark page (and some of its buffers) dirty and rely on + * balance_dirty_pages() to start a write-back. Lustre wants write-back + * to be started earlier for the following reasons: + * + * (1) with a large number of clients we need to limit the amount + * of cached data on the clients a lot; + * + * (2) large compute jobs generally want compute-only then io-only + * and the IO should complete as quickly as possible; + * + * (3) IO is batched up to the RPC size and is async until the + * client max cache is hit + * (/proc/fs/lustre/osc/OSC.../max_dirty_mb) + * + */ + if (!PageDirty(vmpage)) { + tallyop = LPROC_LL_DIRTY_MISSES; + result = cl_page_cache_add(env, io, pg, CRT_WRITE); + if (result == 0) { + /* page was added into cache successfully. */ + set_page_dirty(vmpage); + vvp_write_pending(cl2ccc(obj), cp); + } else if (result == -EDQUOT) { + pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + bool need_clip = true; + + /* + * Client ran out of disk space grant. Possible + * strategies are: + * + * (a) do a sync write, renewing grant; + * + * (b) stop writing on this stripe, switch to the + * next one. + * + * (b) is a part of "parallel io" design that is the + * ultimate goal. (a) is what "old" client did, and + * what the new code continues to do for the time + * being. + */ + if (last_index > pg->cp_index) { + to = PAGE_CACHE_SIZE; + need_clip = false; + } else if (last_index == pg->cp_index) { + int size_to = i_size_read(inode) & ~CFS_PAGE_MASK; + if (to < size_to) + to = size_to; + } + if (need_clip) + cl_page_clip(env, pg, 0, to); + result = vvp_page_sync_io(env, io, pg, cp, CRT_WRITE); + if (result) + CERROR("Write page %lu of inode %p failed %d\n", + pg->cp_index, inode, result); + } + } else { + tallyop = LPROC_LL_DIRTY_HITS; + result = 0; + } + ll_stats_ops_tally(sbi, tallyop, 1); + + /* Inode should be marked DIRTY even if no new page was marked DIRTY + * because page could have been not flushed between 2 modifications. + * It is important the file is marked DIRTY as soon as the I/O is done + * Indeed, when cache is flushed, file could be already closed and it + * is too late to warn the MDT. + * It is acceptable that file is marked DIRTY even if I/O is dropped + * for some reasons before being flushed to OST. + */ + if (result == 0) { + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + + size = cl_offset(obj, pg->cp_index) + to; + + ll_inode_size_lock(inode); + if (result == 0) { + if (size > i_size_read(inode)) { + cl_isize_write_nolock(inode, size); + CDEBUG(D_VFSTRACE, DFID" updating i_size %lu\n", + PFID(lu_object_fid(&obj->co_lu)), + (unsigned long)size); + } + cl_page_export(env, pg, 1); + } else { + if (size > i_size_read(inode)) + cl_page_discard(env, io, pg); + } + ll_inode_size_unlock(inode); + RETURN(result); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_read_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_advance = ccc_io_advance + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_advance = ccc_io_advance + }, + [CIT_SETATTR] = { + .cio_fini = vvp_io_setattr_fini, + .cio_iter_init = vvp_io_setattr_iter_init, + .cio_lock = vvp_io_setattr_lock, + .cio_start = vvp_io_setattr_start, + .cio_end = vvp_io_setattr_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = ccc_io_end + }, + [CIT_FSYNC] = { + .cio_start = vvp_io_fsync_start, + .cio_fini = vvp_io_fini + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + } + }, + .cio_read_page = vvp_io_read_page, + .cio_prepare_write = vvp_io_prepare_write, + .cio_commit_write = vvp_io_commit_write +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + struct inode *inode = ccc_object_inode(obj); + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + ENTRY; + + CL_IO_SLICE_CLEAN(cio, cui_cl); + cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops); + vio->cui_ra_window_set = 0; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + struct ll_inode_info *lli = ll_i2info(inode); + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else { + cio->cui_tot_count = count; + cio->cui_tot_nrsegs = 0; + } + /* for read/write, we store the jobid in the inode, and + * it'll be fetched by osc when building RPC. + * + * it's not accurate if the file is shared by different + * jobs. + */ + lustre_get_jobid(lli->lli_jobid); + } else if (io->ci_type == CIT_SETATTR) { + if (!cl_io_is_trunc(io)) + io->ci_lockreq = CILR_MANDATORY; + } + + /* ignore layout change for generic CIT_MISC but not for glimpse. + * io context for glimpse must set ci_verify_layout to true, + * see cl_glimpse_size0() for details. */ + if (io->ci_type == CIT_MISC && !io->ci_verify_layout) + io->ci_ignore_layout = 1; + + /* Enqueue layout lock and get layout version. We need to do this + * even for operations requiring to open file, such as read and write, + * because it might not grant layout lock in IT_OPEN. */ + if (result == 0 && !io->ci_ignore_layout) + result = ll_layout_refresh(inode, &cio->cui_layout_gen); + + RETURN(result); +} + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + /* Caling just for assertion */ + cl2ccc_io(env, slice); + return vvp_env_io(env); +} diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c new file mode 100644 index 000000000000..9b8712bccd92 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_lock.c @@ -0,0 +1,85 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. + * + */ + +/** + * Estimates lock value for the purpose of managing the lock cache during + * memory shortages. + * + * Locks for memory mapped files are almost infinitely precious, others are + * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are + * ordered within themselves by weights assigned from other layers. + */ +static unsigned long vvp_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct ccc_object *cob = cl2ccc(slice->cls_obj); + + ENTRY; + RETURN(atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0); +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_delete = ccc_lock_delete, + .clo_fini = ccc_lock_fini, + .clo_enqueue = ccc_lock_enqueue, + .clo_wait = ccc_lock_wait, + .clo_unuse = ccc_lock_unuse, + .clo_fits_into = ccc_lock_fits_into, + .clo_state = ccc_lock_state, + .clo_weigh = vvp_lock_weigh +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops); +} diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c new file mode 100644 index 000000000000..01edc5b63e13 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_object.c @@ -0,0 +1,186 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct ccc_object *obj = lu2ccc(o); + struct inode *inode = obj->cob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%s %d %d) inode: %p ", + list_empty(&obj->cob_pending_list) ? "-" : "+", + obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt), + inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %d %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = ccc_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. + */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = LTIME_S(inode->i_mtime); + attr->cat_atime = LTIME_S(inode->i_atime); + attr->cat_ctime = LTIME_S(inode->i_ctime); + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = inode->i_uid; + attr->cat_gid = inode->i_gid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = ccc_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = attr->cat_uid; + if (valid & CAT_GID) + inode->i_gid = attr->cat_gid; + if (valid & CAT_ATIME) + LTIME_S(inode->i_atime) = attr->cat_atime; + if (valid & CAT_MTIME) + LTIME_S(inode->i_mtime) = attr->cat_mtime; + if (valid & CAT_CTIME) + LTIME_S(inode->i_ctime) = attr->cat_ctime; + if (0 && valid & CAT_SIZE) + cl_isize_write_nolock(inode, attr->cat_size); + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE)) + mark_inode_dirty(inode); + return 0; +} + +int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct ll_inode_info *lli = ll_i2info(conf->coc_inode); + + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lli->lli_layout_gen = LL_LAYOUT_GEN_NONE; + return 0; + } + + if (conf->coc_opc != OBJECT_CONF_SET) + return 0; + + if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) { + CDEBUG(D_VFSTRACE, "layout lock change: %u -> %u\n", + lli->lli_layout_gen, + conf->u.coc_md->lsm->lsm_layout_gen); + + lli->lli_has_smd = true; + lli->lli_layout_gen = conf->u.coc_md->lsm->lsm_layout_gen; + } else { + CDEBUG(D_VFSTRACE, "layout lock destroyed: %u.\n", + lli->lli_layout_gen); + + lli->lli_has_smd = false; + lli->lli_layout_gen = LL_LAYOUT_GEN_EMPTY; + } + return 0; +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_set = vvp_attr_set, + .coo_conf_set = vvp_conf_set, + .coo_glimpse = ccc_object_glimpse +}; + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = ccc_object_init, + .loo_object_free = ccc_object_free, + .loo_object_print = vvp_object_print +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + return lu2ccc(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops); +} diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c new file mode 100644 index 000000000000..4568e69bb9f0 --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -0,0 +1,558 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LLITE + + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct ccc_page *cp) +{ + struct page *vmpage = cp->cpg_page; + + LASSERT(vmpage != NULL); + page_cache_release(vmpage); +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. + */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(cp); +} + +static int vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct ccc_page *vpg = cl2ccc_page(slice); + struct page *vmpage = vpg->cpg_page; + + LASSERT(vmpage != NULL); + if (nonblock) { + if (!trylock_page(vmpage)) + return -EAGAIN; + + if (unlikely(PageWriteback(vmpage))) { + unlock_page(vmpage); + return -EAGAIN; + } + + return 0; + } + + lock_page(vmpage); + wait_on_page_writeback(vmpage); + return 0; +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + struct address_space *mapping; + struct ccc_page *cpg = cl2ccc_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + mapping = vmpage->mapping; + + if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used) + ll_ra_stats_inc(mapping, RA_STAT_DISCARDED); + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). + */ + truncate_complete_page(mapping, vmpage); +} + +static int vvp_page_unmap(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + __u64 offset; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + offset = vmpage->index << PAGE_CACHE_SHIFT; + + /* + * XXX is it safe to call this with the page lock held? + */ + ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE); + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == slice->cpl_page); + LASSERT(inode == ccc_object_inode(obj)); + + vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice)); + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). + */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice, + int uptodate) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + if (uptodate) + SetPageUptodate(vmpage); + else + ClearPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0); +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct page *vmpage = cl2vm_page(slice); + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageDirty(vmpage)); + + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), cl2ccc_page(slice)); + + return 0; +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. + */ +static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret) +{ + struct ccc_object *obj = cl_inode2ccc(inode); + + if (ioret == 0) { + ClearPageError(vmpage); + obj->cob_discard_page_warned = 0; + } else { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + + if ((ioret == -ESHUTDOWN || ioret == -EINTR) && + obj->cob_discard_page_warned == 0) { + obj->cob_discard_page_warned = 1; + ll_dirty_page_discard_warn(vmpage, ioret); + } + } +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct page *vmpage = cp->cpg_page; + struct cl_page *page = cl_page_top(slice->cpl_page); + struct inode *inode = ccc_object_inode(page->cp_obj); + ENTRY; + + LASSERT(PageLocked(vmpage)); + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (cp->cpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + if (!cp->cpg_defer_uptodate) + cl_page_export(env, page, 1); + } else + cp->cpg_defer_uptodate = 0; + + if (page->cp_sync_io == NULL) + unlock_page(vmpage); + + EXIT; +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct page *vmpage = cp->cpg_page; + ENTRY; + + LASSERT(ergo(pg->cp_sync_io != NULL, PageLocked(vmpage))); + LASSERT(PageWriteback(vmpage)); + + CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); + + /* + * TODO: Actually it makes sense to add the page into oap pending + * list again and so that we don't need to take the page out from + * SoM write pending list, if we just meet a recoverable error, + * -ENOMEM, etc. + * To implement this, we just need to return a non zero value in + * ->cpo_completion method. The underlying transfer should be notified + * and then re-add the page into pending transfer queue. -jay + */ + + cp->cpg_write_queued = 0; + vvp_write_complete(cl2ccc(slice->cpl_obj), cp); + + /* + * Only mark the page error only when it's an async write because + * applications won't wait for IO to finish. + */ + if (pg->cp_sync_io == NULL) + vvp_vmpage_error(ccc_object_inode(pg->cp_obj), vmpage, ioret); + + end_page_writeback(vmpage); + EXIT; +} + +/** + * Implements cl_page_operations::cpo_make_ready() method. + * + * This is called to yank a page from the transfer cache and to send it out as + * a part of transfer. This function try-locks the page. If try-lock failed, + * page is owned by some concurrent IO, and should be skipped (this is bad, + * but hopefully rare situation, as it usually results in transfer being + * shorter than possible). + * + * \retval 0 success, page can be placed into transfer + * + * \retval -EAGAIN page is either used by concurrent IO has been + * truncated. Skip it. + */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct page *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result = 0; + + lock_page(vmpage); + if (clear_page_dirty_for_io(vmpage)) { + LASSERT(pg->cp_state == CPS_CACHED); + /* This actually clears the dirty bit in the radix + * tree. */ + set_page_writeback(vmpage); + vvp_write_pending(cl2ccc(slice->cpl_obj), + cl2ccc_page(slice)); + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + } else if (pg->cp_state == CPS_PAGEOUT) { + /* is it possible for osc_flush_async_page() to already + * make it ready? */ + result = -EALREADY; + } else { + CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", + pg->cp_state); + LBUG(); + } + unlock_page(vmpage); + RETURN(result); +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct ccc_page *vp = cl2ccc_page(slice); + struct page *vmpage = vp->cpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) " + "vm@%p ", + vp, vp->cpg_defer_uptodate, vp->cpg_ra_used, + vp->cpg_write_queued, vmpage); + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ? "not-" : ""); + } + (*printer)(env, cookie, "\n"); + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_vmpage = ccc_page_vmpage, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_unmap = vvp_page_unmap, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = ccc_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + } + } +}; + +static void vvp_transient_page_verify(const struct cl_page *page) +{ + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(!mutex_trylock(&inode->i_mutex)); +} + +static int vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused, int nonblock) +{ + vvp_transient_page_verify(slice->cpl_page); + return 0; +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = ccc_object_inode(slice->cpl_obj); + int locked; + + locked = !mutex_trylock(&inode->i_mutex); + if (!locked) + mutex_unlock(&inode->i_mutex); + return locked ? -EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *clp = slice->cpl_page; + struct ccc_object *clobj = cl2ccc(clp->cp_obj); + + vvp_page_fini_common(cp); + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + clobj->cob_transient_pages--; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_vmpage = ccc_page_vmpage, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion, + } + } +}; + +int vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct ccc_page *cpg = cl_object_page_slice(obj, page); + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + cpg->cpg_page = vmpage; + page_cache_get(vmpage); + + INIT_LIST_HEAD(&cpg->cpg_pending_linkage); + if (page->cp_type == CPT_CACHEABLE) { + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_page_ops); + } else { + struct ccc_object *clobj = cl2ccc(obj); + + LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_transient_page_ops); + clobj->cob_transient_pages++; + } + return 0; +} diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c new file mode 100644 index 000000000000..4176264984bb --- /dev/null +++ b/drivers/staging/lustre/lustre/llite/xattr.c @@ -0,0 +1,578 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include + +#include "llite_internal.h" + +#define XATTR_USER_T (1) +#define XATTR_TRUSTED_T (2) +#define XATTR_SECURITY_T (3) +#define XATTR_ACL_ACCESS_T (4) +#define XATTR_ACL_DEFAULT_T (5) +#define XATTR_LUSTRE_T (6) +#define XATTR_OTHER_T (7) + +static +int get_xattr_type(const char *name) +{ + if (!strcmp(name, POSIX_ACL_XATTR_ACCESS)) + return XATTR_ACL_ACCESS_T; + + if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT)) + return XATTR_ACL_DEFAULT_T; + + if (!strncmp(name, XATTR_USER_PREFIX, + sizeof(XATTR_USER_PREFIX) - 1)) + return XATTR_USER_T; + + if (!strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1)) + return XATTR_TRUSTED_T; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1)) + return XATTR_SECURITY_T; + + if (!strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1)) + return XATTR_LUSTRE_T; + + return XATTR_OTHER_T; +} + +static +int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type) +{ + if ((xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T) && + !(sbi->ll_flags & LL_SBI_ACL)) + return -EOPNOTSUPP; + + if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR)) + return -EOPNOTSUPP; + if (xattr_type == XATTR_TRUSTED_T && !cfs_capable(CFS_CAP_SYS_ADMIN)) + return -EPERM; + if (xattr_type == XATTR_OTHER_T) + return -EOPNOTSUPP; + + return 0; +} + +static +int ll_setxattr_common(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req; + int xattr_type, rc; + struct obd_capa *oc; + posix_acl_xattr_header *new_value = NULL; + struct rmtacl_ctl_entry *rce = NULL; + ext_acl_xattr_header *acl = NULL; + const char *pv = value; + ENTRY; + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + /* b10667: ignore lustre special xattr for now */ + if ((xattr_type == XATTR_TRUSTED_T && strcmp(name, "trusted.lov") == 0) || + (xattr_type == XATTR_LUSTRE_T && strcmp(name, "lustre.lov") == 0)) + RETURN(0); + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + RETURN(0); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_RSETFACL)) + RETURN(-EOPNOTSUPP); + + if (rce->rce_ops == RMT_LSETFACL) { + struct eacl_entry *ee; + + ee = et_search_del(&sbi->ll_et, current_pid(), + ll_inode2fid(inode), xattr_type); + LASSERT(ee != NULL); + if (valid & OBD_MD_FLXATTR) { + acl = lustre_acl_xattr_merge2ext( + (posix_acl_xattr_header *)value, + size, ee->ee_acl); + if (IS_ERR(acl)) { + ee_free(ee); + RETURN(PTR_ERR(acl)); + } + size = CFS_ACL_XATTR_SIZE(\ + le32_to_cpu(acl->a_count), \ + ext_acl_xattr); + pv = (const char *)acl; + } + ee_free(ee); + } else if (rce->rce_ops == RMT_RSETFACL) { + size = lustre_posix_acl_xattr_filter( + (posix_acl_xattr_header *)value, + size, &new_value); + if (unlikely(size < 0)) + RETURN(size); + + pv = (const char *)new_value; + } else + RETURN(-EOPNOTSUPP); + + valid |= rce_ops2valid(rce->rce_ops); + } +#endif + oc = ll_mdscapa_get(inode); + rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid, name, pv, size, 0, flags, ll_i2suppgid(inode), + &req); + capa_put(oc); +#ifdef CONFIG_FS_POSIX_ACL + if (new_value != NULL) + lustre_posix_acl_xattr_free(new_value, size); + if (acl != NULL) + lustre_ext_acl_xattr_free(acl); +#endif + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + ptlrpc_req_finished(req); + RETURN(0); +} + +int ll_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_user_md *lump = (struct lov_user_md *)value; + int rc = 0; + + /* Attributes that are saved via getxattr will always have + * the stripe_offset as 0. Instead, the MDS should be + * allowed to pick the starting OST index. b=17846 */ + if (lump != NULL && lump->lmm_stripe_offset == 0) + lump->lmm_stripe_offset = -1; + + if (lump != NULL && S_ISREG(inode->i_mode)) { + struct file f; + int flags = FMODE_WRITE; + int lum_size = (lump->lmm_magic == LOV_USER_MAGIC_V1) ? + sizeof(*lump) : sizeof(struct lov_user_md_v3); + + f.f_dentry = dentry; + rc = ll_lov_setstripe_ea_info(inode, &f, flags, lump, + lum_size); + /* b10667: rc always be 0 here for now */ + rc = 0; + } else if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_setstripe(inode, lump, 0); + } + + return rc; + + } else if (strcmp(name, XATTR_NAME_LMA) == 0 || + strcmp(name, XATTR_NAME_LINK) == 0) + return 0; + + return ll_setxattr_common(inode, name, value, size, flags, + OBD_MD_FLXATTR); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); + return ll_setxattr_common(inode, name, NULL, 0, 0, + OBD_MD_FLXATTRRM); +} + +static +int ll_getxattr_common(struct inode *inode, const char *name, + void *buffer, size_t size, __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *req = NULL; + struct mdt_body *body; + int xattr_type, rc; + void *xdata; + struct obd_capa *oc; + struct rmtacl_ctl_entry *rce = NULL; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + /* listxattr have slightly different behavior from of ext3: + * without 'user_xattr' ext3 will list all xattr names but + * filtered out "^user..*"; we list them all for simplicity. + */ + if (!name) { + xattr_type = XATTR_OTHER_T; + goto do_getxattr; + } + + xattr_type = get_xattr_type(name); + rc = xattr_type_filter(sbi, xattr_type); + if (rc) + RETURN(rc); + + /* b15587: ignore security.capability xattr for now */ + if ((xattr_type == XATTR_SECURITY_T && + strcmp(name, "security.capability") == 0)) + RETURN(-ENODATA); + + /* LU-549: Disable security.selinux when selinux is disabled */ + if (xattr_type == XATTR_SECURITY_T && !selinux_is_enabled() && + strcmp(name, "security.selinux") == 0) + RETURN(-EOPNOTSUPP); + +#ifdef CONFIG_FS_POSIX_ACL + if (sbi->ll_flags & LL_SBI_RMT_CLIENT && + (xattr_type == XATTR_ACL_ACCESS_T || + xattr_type == XATTR_ACL_DEFAULT_T)) { + rce = rct_search(&sbi->ll_rct, current_pid()); + if (rce == NULL || + (rce->rce_ops != RMT_LSETFACL && + rce->rce_ops != RMT_LGETFACL && + rce->rce_ops != RMT_RSETFACL && + rce->rce_ops != RMT_RGETFACL)) + RETURN(-EOPNOTSUPP); + } + + /* posix acl is under protection of LOOKUP lock. when calling to this, + * we just have path resolution to the target inode, so we have great + * chance that cached ACL is uptodate. + */ + if (xattr_type == XATTR_ACL_ACCESS_T && + !(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(lli->lli_posix_acl); + spin_unlock(&lli->lli_lock); + + if (!acl) + RETURN(-ENODATA); + + rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + RETURN(rc); + } + if (xattr_type == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) + RETURN(-ENODATA); +#endif + +do_getxattr: + oc = ll_mdscapa_get(inode); + rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, + valid | (rce ? rce_ops2valid(rce->rce_ops) : 0), + name, NULL, 0, size, 0, &req); + capa_put(oc); + if (rc) { + if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) { + LCONSOLE_INFO("Disabling user_xattr feature because " + "it is not supported on the server\n"); + sbi->ll_flags &= ~LL_SBI_USER_XATTR; + } + RETURN(rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + + /* only detect the xattr size */ + if (size == 0) + GOTO(out, rc = body->eadatasize); + + if (size < body->eadatasize) { + CERROR("server bug: replied size %u > %u\n", + body->eadatasize, (int)size); + GOTO(out, rc = -ERANGE); + } + + if (body->eadatasize == 0) + GOTO(out, rc = -ENODATA); + + /* do not need swab xattr data */ + xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, + body->eadatasize); + if (!xdata) + GOTO(out, rc = -EFAULT); + +#ifdef CONFIG_FS_POSIX_ACL + if (body->eadatasize >= 0 && rce && rce->rce_ops == RMT_LSETFACL) { + ext_acl_xattr_header *acl; + + acl = lustre_posix_acl_xattr_2ext((posix_acl_xattr_header *)xdata, + body->eadatasize); + if (IS_ERR(acl)) + GOTO(out, rc = PTR_ERR(acl)); + + rc = ee_add(&sbi->ll_et, current_pid(), ll_inode2fid(inode), + xattr_type, acl); + if (unlikely(rc < 0)) { + lustre_ext_acl_xattr_free(acl); + GOTO(out, rc); + } + } +#endif + + if (body->eadatasize == 0) { + rc = -ENODATA; + } else { + LASSERT(buffer); + memcpy(buffer, xdata, body->eadatasize); + rc = body->eadatasize; + } + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +ssize_t ll_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + + LASSERT(inode); + LASSERT(name); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), xattr %s\n", + inode->i_ino, inode->i_generation, inode, name); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); + + if ((strncmp(name, XATTR_TRUSTED_PREFIX, + sizeof(XATTR_TRUSTED_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_TRUSTED_PREFIX) - 1, "lov") == 0) || + (strncmp(name, XATTR_LUSTRE_PREFIX, + sizeof(XATTR_LUSTRE_PREFIX) - 1) == 0 && + strcmp(name + sizeof(XATTR_LUSTRE_PREFIX) - 1, "lov") == 0)) { + struct lov_stripe_md *lsm; + struct lov_user_md *lump; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int rc = 0, lmmsize = 0; + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -ENODATA; + + if (size == 0 && S_ISDIR(inode->i_mode)) { + /* XXX directory EA is fix for now, optimize to save + * RPC transfer */ + GOTO(out, rc = sizeof(struct lov_user_md)); + } + + lsm = ccc_inode_lsm_get(inode); + if (lsm == NULL) { + if (S_ISDIR(inode->i_mode)) { + rc = ll_dir_getstripe(inode, &lmm, + &lmmsize, &request); + } else { + rc = -ENODATA; + } + } else { + /* LSM is present already after lookup/getattr call. + * we need to grab layout lock once it is implemented */ + rc = obd_packmd(ll_i2dtexp(inode), &lmm, lsm); + lmmsize = rc; + } + ccc_inode_lsm_put(inode, lsm); + + if (rc < 0) + GOTO(out, rc); + + if (size == 0) { + /* used to call ll_get_max_mdsize() forward to get + * the maximum buffer size, while some apps (such as + * rsync 3.0.x) care much about the exact xattr value + * size */ + rc = lmmsize; + GOTO(out, rc); + } + + if (size < lmmsize) { + CERROR("server bug: replied size %d > %d for %s (%s)\n", + lmmsize, (int)size, dentry->d_name.name, name); + GOTO(out, rc = -ERANGE); + } + + lump = (struct lov_user_md *)buffer; + memcpy(lump, lmm, lmmsize); + /* do not return layout gen for getxattr otherwise it would + * confuse tar --xattr by recognizing layout gen as stripe + * offset when the file is restored. See LU-2809. */ + lump->lmm_layout_gen = 0; + + rc = lmmsize; +out: + if (request) + ptlrpc_req_finished(request); + else if (lmm) + obd_free_diskmd(ll_i2dtexp(inode), &lmm); + return(rc); + } + + return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR); +} + +ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + int rc = 0, rc2 = 0; + struct lov_mds_md *lmm = NULL; + struct ptlrpc_request *request = NULL; + int lmmsize; + + LASSERT(inode); + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", + inode->i_ino, inode->i_generation, inode); + + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); + + rc = ll_getxattr_common(inode, NULL, buffer, size, OBD_MD_FLXATTRLS); + if (rc < 0) + GOTO(out, rc); + + if (buffer != NULL) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + char *xattr_name = buffer; + int xlen, rem = rc; + + while (rem > 0) { + xlen = strnlen(xattr_name, rem - 1) + 1; + rem -= xlen; + if (xattr_type_filter(sbi, + get_xattr_type(xattr_name)) == 0) { + /* skip OK xattr type + * leave it in buffer + */ + xattr_name += xlen; + continue; + } + /* move up remaining xattrs in buffer + * removing the xattr that is not OK + */ + memmove(xattr_name, xattr_name + xlen, rem); + rc -= xlen; + } + } + if (S_ISREG(inode->i_mode)) { + if (!ll_i2info(inode)->lli_has_smd) + rc2 = -1; + } else if (S_ISDIR(inode->i_mode)) { + rc2 = ll_dir_getstripe(inode, &lmm, &lmmsize, &request); + } + + if (rc2 < 0) { + GOTO(out, rc2 = 0); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) { + const int prefix_len = sizeof(XATTR_LUSTRE_PREFIX) - 1; + const size_t name_len = sizeof("lov") - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (buffer && (rc + total_len) <= size) { + buffer += rc; + memcpy(buffer, XATTR_LUSTRE_PREFIX, prefix_len); + memcpy(buffer + prefix_len, "lov", name_len); + buffer[prefix_len + name_len] = '\0'; + } + rc2 = total_len; + } +out: + ptlrpc_req_finished(request); + rc = rc + rc2; + + return rc; +} diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile new file mode 100644 index 000000000000..8cc81ade126c --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += lmv.o +lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c new file mode 100644 index 000000000000..a4805aefa684 --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/lmv_fld.c @@ -0,0 +1,88 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +int lmv_fld_lookup(struct lmv_obd *lmv, + const struct lu_fid *fid, + mdsno_t *mds) +{ + int rc; + ENTRY; + + + /* FIXME: Currently ZFS still use local seq for ROOT unfortunately, and + * this fid_is_local check should be removed once LU-2240 is fixed */ + LASSERTF((fid_seq_in_fldb(fid_seq(fid)) || + fid_seq_is_local_file(fid_seq(fid))) && + fid_is_sane(fid), DFID" is insane!\n", PFID(fid)); + + rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, + LU_SEQ_RANGE_MDT, NULL); + if (rc) { + CERROR("Error while looking for mds number. Seq "LPX64 + ", err = %d\n", fid_seq(fid), rc); + RETURN(rc); + } + + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", + *mds, PFID(fid)); + + if (*mds >= lmv->desc.ld_tgt_count) { + CERROR("FLD lookup got invalid mds #%x (max: %x) " + "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count, + PFID(fid)); + rc = -EINVAL; + } + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c new file mode 100644 index 000000000000..7eefab5ef5d0 --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/lmv_intent.c @@ -0,0 +1,328 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static int lmv_intent_remote(struct obd_export *exp, void *lmm, + int lmmsize, struct lookup_intent *it, + const struct lu_fid *parent_fid, int flags, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct ptlrpc_request *req = NULL; + struct lustre_handle plock; + struct md_op_data *op_data; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int pmode; + int rc = 0; + ENTRY; + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + LASSERT((body->valid & OBD_MD_MDS)); + + /* + * Unfortunately, we have to lie to MDC/MDS to retrieve + * attributes llite needs and provideproper locking. + */ + if (it->it_op & IT_LOOKUP) + it->it_op = IT_GETATTR; + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + if (pmode) { + plock.cookie = it->d.lustre.it_lock_handle; + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + } + + LASSERT(fid_is_sane(&body->fid1)); + + tgt = lmv_find_target(lmv, &body->fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out, rc = -ENOMEM); + + op_data->op_fid1 = body->fid1; + /* Sent the parent FID to the remote MDT */ + if (parent_fid != NULL) { + /* The parent fid is only for remote open to + * check whether the open is from OBF, + * see mdt_cross_open */ + LASSERT(it->it_op & IT_OPEN); + op_data->op_fid2 = *parent_fid; + /* Add object FID to op_fid3, in case it needs to check stale + * (M_CHECK_STALE), see mdc_finish_intent_lock */ + op_data->op_fid3 = body->fid1; + } + + op_data->op_bias = MDS_CROSS_REF; + CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%d\n", + PFID(&body->fid1), tgt->ltd_idx); + + it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE; + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, &req, cb_blocking, extra_lock_flags); + if (rc) + GOTO(out_free_op_data, rc); + + /* + * LLite needs LOOKUP lock to track dentry revocation in order to + * maintain dcache consistency. Thus drop UPDATE|PERM lock here + * and put LOOKUP in request. + */ + if (it->d.lustre.it_lock_mode != 0) { + it->d.lustre.it_remote_lock_handle = + it->d.lustre.it_lock_handle; + it->d.lustre.it_remote_lock_mode = it->d.lustre.it_lock_mode; + } + + it->d.lustre.it_lock_handle = plock.cookie; + it->d.lustre.it_lock_mode = pmode; + + EXIT; +out_free_op_data: + OBD_FREE_PTR(op_data); +out: + if (rc && pmode) + ldlm_lock_decref(&plock, pmode); + + ptlrpc_req_finished(*reqp); + *reqp = req; + return rc; +} + +/* + * IT_OPEN is intended to open (and create, possible) an object. Parent (pid) + * may be split dir. + */ +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* If it is ready to open the file by FID, do not need + * allocate FID at all, otherwise it will confuse MDT */ + if ((it->it_op & IT_CREAT) && + !(it->it_flags & MDS_OPEN_BY_FID)) { + /* + * For open with IT_CREATE and for IT_CREATE cases allocate new + * fid and setup FLD for it. + */ + op_data->op_fid3 = op_data->op_fid2; + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc != 0) + RETURN(rc); + } + + CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID"," + " name='%s' -> mds #%d\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx); + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) + RETURN(rc); + /* + * Nothing is found, do not access body->fid1 as it is zero and thus + * pointless. + */ + if ((it->d.lustre.it_disposition & DISP_LOOKUP_NEG) && + !(it->d.lustre.it_disposition & DISP_OPEN_CREATE) && + !(it->d.lustre.it_disposition & DISP_OPEN_OPEN)) + RETURN(rc); + + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + /* + * Not cross-ref case, just get out of here. + */ + if (likely(!(body->valid & OBD_MD_MDS))) + RETURN(0); + + /* + * Okay, MDS has returned success. Probably name has been resolved in + * remote inode. + */ + rc = lmv_intent_remote(exp, lmm, lmmsize, it, &op_data->op_fid1, flags, + reqp, cb_blocking, extra_lock_flags); + if (rc != 0) { + LASSERT(rc < 0); + /* + * This is possible, that some userspace application will try to + * open file as directory and we will have -ENOTDIR here. As + * this is normal situation, we should not print error here, + * only debug info. + */ + CDEBUG(D_INODE, "Can't handle remote %s: dir "DFID"("DFID"):" + "%*s: %d\n", LL_IT2STR(it), PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), op_data->op_namelen, + op_data->op_name, rc); + RETURN(rc); + } + + RETURN(rc); +} + +/* + * Handler for: getattr, lookup and revalidate cases. + */ +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc = 0; + ENTRY; + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (!fid_is_sane(&op_data->op_fid2)) + fid_zero(&op_data->op_fid2); + + CDEBUG(D_INODE, "LOOKUP_INTENT with fid1="DFID", fid2="DFID + ", name='%s' -> mds #%d\n", PFID(&op_data->op_fid1), + PFID(&op_data->op_fid2), + op_data->op_name ? op_data->op_name : "", + tgt->ltd_idx); + + op_data->op_bias &= ~MDS_CROSS_REF; + + rc = md_intent_lock(tgt->ltd_exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, extra_lock_flags); + + if (rc < 0 || *reqp == NULL) + RETURN(rc); + + /* + * MDS has returned success. Probably name has been resolved in + * remote inode. Let's check this. + */ + body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + RETURN(0); + + rc = lmv_intent_remote(exp, lmm, lmmsize, it, NULL, flags, reqp, + cb_blocking, extra_lock_flags); + + RETURN(rc); +} + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + int rc; + ENTRY; + + LASSERT(it != NULL); + LASSERT(fid_is_sane(&op_data->op_fid1)); + + CDEBUG(D_INODE, "INTENT LOCK '%s' for '%*s' on "DFID"\n", + LL_IT2STR(it), op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid1)); + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT)) + rc = lmv_intent_lookup(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else if (it->it_op & IT_OPEN) + rc = lmv_intent_open(exp, op_data, lmm, lmmsize, it, + flags, reqp, cb_blocking, + extra_lock_flags); + else + LBUG(); + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h new file mode 100644 index 000000000000..f75b0a987681 --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/lmv_internal.h @@ -0,0 +1,159 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LMV_INTERNAL_H_ +#define _LMV_INTERNAL_H_ + +#include +#include + +#define LMV_MAX_TGT_COUNT 128 + +#define lmv_init_lock(lmv) mutex_lock(&lmv->init_mutex); +#define lmv_init_unlock(lmv) mutex_unlock(&lmv->init_mutex); + +#define LL_IT2STR(it) \ + ((it) ? ldlm_it2str((it)->it_op) : "0") + +int lmv_check_connect(struct obd_device *obd); + +int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); + +int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *, int); +int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, + mdsno_t *mds); +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, + mdsno_t mds); +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +static inline struct lmv_stripe_md *lmv_get_mea(struct ptlrpc_request *req) +{ + struct mdt_body *body; + struct lmv_stripe_md *mea; + + LASSERT(req != NULL); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + + if (!body || !S_ISDIR(body->mode) || !body->eadatasize) + return NULL; + + mea = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, + body->eadatasize); + LASSERT(mea != NULL); + + if (mea->mea_count == 0) + return NULL; + if( mea->mea_magic != MEA_MAGIC_LAST_CHAR && + mea->mea_magic != MEA_MAGIC_ALL_CHARS && + mea->mea_magic != MEA_MAGIC_HASH_SEGMENT) + return NULL; + + return mea; +} + +static inline int lmv_get_easize(struct lmv_obd *lmv) +{ + return sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * + sizeof(struct lu_fid); +} + +static inline struct lmv_tgt_desc * +lmv_get_target(struct lmv_obd *lmv, mdsno_t mds) +{ + int count = lmv->desc.ld_tgt_count; + int i; + + for (i = 0; i < count; i++) { + if (lmv->tgts[i] == NULL) + continue; + + if (lmv->tgts[i]->ltd_idx == mds) + return lmv->tgts[i]; + } + + return ERR_PTR(-ENODEV); +} + +static inline struct lmv_tgt_desc * +lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) +{ + mdsno_t mds = 0; + int rc; + + if (lmv->desc.ld_tgt_count > 1) { + rc = lmv_fld_lookup(lmv, fid, &mds); + if (rc) + return ERR_PTR(rc); + } + + return lmv_get_target(lmv, mds); +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid); +/* lproc_lmv.c */ +#ifdef LPROCFS +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif +extern struct file_operations lmv_proc_target_fops; + +#endif diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c new file mode 100644 index 000000000000..a13eead0a6cf --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c @@ -0,0 +1,2734 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LMV +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "lmv_internal.h" + +static void lmv_activate_target(struct lmv_obd *lmv, + struct lmv_tgt_desc *tgt, + int activate) +{ + if (tgt->ltd_active == activate) + return; + + tgt->ltd_active = activate; + lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); +} + +/** + * Error codes: + * + * -EINVAL : UUID can't be found in the LMV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD of the wrong type (!) + */ +static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid, + int activate) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + int i; + int rc = 0; + ENTRY; + + CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", + lmv, uuid->uuid, activate); + + spin_lock(&lmv->lmv_lock); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + CDEBUG(D_INFO, "Target idx %d is %s conn "LPX64"\n", i, + tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); + + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (i == lmv->desc.ld_tgt_count) + GOTO(out_lmv_lock, rc = -EINVAL); + + obd = class_exp2obd(tgt->ltd_exp); + if (obd == NULL) + GOTO(out_lmv_lock, rc = -ENOTCONN); + + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, i); + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); + + if (tgt->ltd_active == activate) { + CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, + activate ? "" : "in"); + GOTO(out_lmv_lock, rc); + } + + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + activate ? "" : "in"); + lmv_activate_target(lmv, tgt, activate); + EXIT; + + out_lmv_lock: + spin_unlock(&lmv->lmv_lock); + return rc; +} + +struct obd_uuid *lmv_get_uuid(struct obd_export *exp) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return obd_get_uuid(lmv->tgts[0]->ltd_exp); +} + +static int lmv_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + struct obd_connect_data *conn_data; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *uuid; + int rc = 0; + ENTRY; + + if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + + uuid = &watched->u.cli.cl_target_uuid; + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { + /* + * Set MDC as active before notifying the observer, so the + * observer can use the MDC normally. + */ + rc = lmv_set_mdc_active(lmv, uuid, + ev == OBD_NOTIFY_ACTIVE); + if (rc) { + CERROR("%sactivation of %s failed: %d\n", + ev == OBD_NOTIFY_ACTIVE ? "" : "de", + uuid->uuid, rc); + RETURN(rc); + } + } else if (ev == OBD_NOTIFY_OCD) { + conn_data = &watched->u.cli.cl_import->imp_connect_data; + /* + * XXX: Make sure that ocd_connect_flags from all targets are + * the same. Otherwise one of MDTs runs wrong version or + * something like this. --umka + */ + obd->obd_self_export->exp_connect_data = *conn_data; + } +#if 0 + else if (ev == OBD_NOTIFY_DISCON) { + /* + * For disconnect event, flush fld cache for failout MDS case. + */ + fld_client_flush(&lmv->lmv_fld); + } +#endif + /* + * Pass the notification up the chain. + */ + if (obd->obd_observer) + rc = obd_notify(obd->obd_observer, watched, ev, data); + + RETURN(rc); +} + +/** + * This is fake connect function. Its purpose is to initialize lmv and say + * caller that everything is okay. Real connection will be performed later. + */ +static int lmv_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle conn = { 0 }; + int rc = 0; + ENTRY; + + /* + * We don't want to actually do the underlying connections more than + * once, so keep track. + */ + lmv->refcount++; + if (lmv->refcount > 1) { + *exp = NULL; + RETURN(0); + } + + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + *exp = class_conn2export(&conn); + class_export_get(*exp); + + lmv->exp = *exp; + lmv->connected = 0; + lmv->cluuid = *cluuid; + + if (data) + lmv->conn_data = *data; + + lmv_proc_dir = lprocfs_register("target_obds", obd->obd_proc_entry, + NULL, NULL); + if (IS_ERR(lmv_proc_dir)) { + CERROR("could not register /proc/fs/lustre/%s/%s/target_obds.", + obd->obd_type->typ_name, obd->obd_name); + lmv_proc_dir = NULL; + } + + /* + * All real clients should perform actual connection right away, because + * it is possible, that LMV will not have opportunity to connect targets + * and MDC stuff will be called directly, for instance while reading + * ../mdc/../kbytesfree procfs file, etc. + */ + if (data->ocd_connect_flags & OBD_CONNECT_REAL) + rc = lmv_check_connect(obd); + + if (rc) { + if (lmv_proc_dir) + lprocfs_remove(&lmv_proc_dir); + } + + RETURN(rc); +} + +static void lmv_set_timeouts(struct obd_device *obd) +{ + struct lmv_tgt_desc *tgt; + struct lmv_obd *lmv; + int i; + + lmv = &obd->u.lmv; + if (lmv->server_timeout == 0) + return; + + if (lmv->connected == 0) + return; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + + obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS), + KEY_INTERMDS, 0, NULL, NULL); + } +} + +static int lmv_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc = 0; + int change = 0; + ENTRY; + + if (lmv->max_easize < easize) { + lmv->max_easize = easize; + change = 1; + } + if (lmv->max_def_easize < def_easize) { + lmv->max_def_easize = def_easize; + change = 1; + } + if (lmv->max_cookiesize < cookiesize) { + lmv->max_cookiesize = cookiesize; + change = 1; + } + if (change == 0) + RETURN(0); + + if (lmv->connected == 0) + RETURN(0); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) { + CWARN("%s: NULL export for %d\n", obd->obd_name, i); + continue; + } + + rc = md_init_ea_size(lmv->tgts[i]->ltd_exp, easize, def_easize, + cookiesize); + if (rc) { + CERROR("%s: obd_init_ea_size() failed on MDT target %d:" + " rc = %d.\n", obd->obd_name, i, rc); + break; + } + } + RETURN(rc); +} + +#define MAX_STRING_SIZE 128 + +int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_uuid *cluuid = &lmv->cluuid; + struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" }; + struct obd_device *mdc_obd; + struct obd_export *mdc_exp; + struct lu_fld_target target; + int rc; + ENTRY; + + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + tgt->ltd_uuid.uuid, obd->obd_uuid.uuid, + cluuid->uuid); + + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } + + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid, + &lmv->conn_data, NULL); + if (rc) { + CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + /* + * Init fid sequence client for this mdc and add new fld target. + */ + rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); + if (rc) + RETURN(rc); + + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_idx; + + fld_client_add_target(&lmv->lmv_fld, &target); + + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } + + if (obd->obd_observer) { + /* + * Tell the observer about the new target. + */ + rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, + OBD_NOTIFY_ACTIVE, + (void *)(tgt - lmv->tgts[0])); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + } + + tgt->ltd_active = 1; + tgt->ltd_exp = mdc_exp; + lmv->desc.ld_active_tgt_count++; + + md_init_ea_size(tgt->ltd_exp, lmv->max_easize, + lmv->max_def_easize, lmv->max_cookiesize); + + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", + mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + + lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); + if (lmv_proc_dir) { + struct proc_dir_entry *mdc_symlink; + + LASSERT(mdc_obd->obd_type != NULL); + LASSERT(mdc_obd->obd_type->typ_name != NULL); + mdc_symlink = lprocfs_add_symlink(mdc_obd->obd_name, + lmv_proc_dir, + "../../../%s/%s", + mdc_obd->obd_type->typ_name, + mdc_obd->obd_name); + if (mdc_symlink == NULL) { + CERROR("Could not register LMV target " + "/proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + lprocfs_remove(&lmv_proc_dir); + lmv_proc_dir = NULL; + } + } + RETURN(0); +} + +static void lmv_del_target(struct lmv_obd *lmv, int index) +{ + if (lmv->tgts[index] == NULL) + return; + + OBD_FREE_PTR(lmv->tgts[index]); + lmv->tgts[index] = NULL; + return; +} + +static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); + + lmv_init_lock(lmv); + + if (lmv->desc.ld_tgt_count == 0) { + struct obd_device *mdc_obd; + + mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + lmv_init_unlock(lmv); + CERROR("%s: Target %s not attached: rc = %d\n", + obd->obd_name, uuidp->uuid, -EINVAL); + RETURN(-EINVAL); + } + } + + if ((index < lmv->tgts_size) && (lmv->tgts[index] != NULL)) { + tgt = lmv->tgts[index]; + CERROR("%s: UUID %s already assigned at LOV target index %d:" + " rc = %d\n", obd->obd_name, + obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); + lmv_init_unlock(lmv); + RETURN(-EEXIST); + } + + if (index >= lmv->tgts_size) { + /* We need to reallocate the lmv target array. */ + struct lmv_tgt_desc **newtgts, **old = NULL; + __u32 newsize = 1; + __u32 oldsize = 0; + + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + lmv_init_unlock(lmv); + RETURN(-ENOMEM); + } + + if (lmv->tgts_size) { + memcpy(newtgts, lmv->tgts, + sizeof(*newtgts) * lmv->tgts_size); + old = lmv->tgts; + oldsize = lmv->tgts_size; + } + + lmv->tgts = newtgts; + lmv->tgts_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, + lmv->tgts_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + lmv_init_unlock(lmv); + RETURN(-ENOMEM); + } + + mutex_init(&tgt->ltd_fid_mutex); + tgt->ltd_idx = index; + tgt->ltd_uuid = *uuidp; + tgt->ltd_active = 0; + lmv->tgts[index] = tgt; + if (index >= lmv->desc.ld_tgt_count) + lmv->desc.ld_tgt_count = index + 1; + + if (lmv->connected) { + rc = lmv_connect_mdc(obd, tgt); + if (rc) { + spin_lock(&lmv->lmv_lock); + lmv->desc.ld_tgt_count--; + memset(tgt, 0, sizeof(*tgt)); + spin_unlock(&lmv->lmv_lock); + } else { + int easize = sizeof(struct lmv_stripe_md) + + lmv->desc.ld_tgt_count * + sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0); + } + } + + lmv_init_unlock(lmv); + RETURN(rc); +} + +int lmv_check_connect(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i; + int rc; + int easize; + ENTRY; + + if (lmv->connected) + RETURN(0); + + lmv_init_lock(lmv); + if (lmv->connected) { + lmv_init_unlock(lmv); + RETURN(0); + } + + if (lmv->desc.ld_tgt_count == 0) { + lmv_init_unlock(lmv); + CERROR("%s: no targets configured.\n", obd->obd_name); + RETURN(-EINVAL); + } + + CDEBUG(D_CONFIG, "Time to connect %s to %s\n", + lmv->cluuid.uuid, obd->obd_name); + + LASSERT(lmv->tgts != NULL); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + rc = lmv_connect_mdc(obd, tgt); + if (rc) + GOTO(out_disc, rc); + } + + lmv_set_timeouts(obd); + class_export_put(lmv->exp); + lmv->connected = 1; + easize = lmv_get_easize(lmv); + lmv_init_ea_size(obd->obd_self_export, easize, 0, 0); + lmv_init_unlock(lmv); + RETURN(0); + + out_disc: + while (i-- > 0) { + int rc2; + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + tgt->ltd_active = 0; + if (tgt->ltd_exp) { + --lmv->desc.ld_active_tgt_count; + rc2 = obd_disconnect(tgt->ltd_exp); + if (rc2) { + CERROR("LMV target %s disconnect on " + "MDC idx %d: error %d\n", + tgt->ltd_uuid.uuid, i, rc2); + } + } + } + class_disconnect(lmv->exp); + lmv_init_unlock(lmv); + RETURN(rc); +} + +static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_device *mdc_obd; + int rc; + ENTRY; + + LASSERT(tgt != NULL); + LASSERT(obd != NULL); + + mdc_obd = class_exp2obd(tgt->ltd_exp); + + if (mdc_obd) { + mdc_obd->obd_force = obd->obd_force; + mdc_obd->obd_fail = obd->obd_fail; + mdc_obd->obd_no_recov = obd->obd_no_recov; + } + + lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); + if (lmv_proc_dir) { + struct proc_dir_entry *mdc_symlink; + + mdc_symlink = lprocfs_srch(lmv_proc_dir, mdc_obd->obd_name); + if (mdc_symlink) { + lprocfs_remove(&mdc_symlink); + } else { + CERROR("/proc/fs/lustre/%s/%s/target_obds/%s missing\n", + obd->obd_type->typ_name, obd->obd_name, + mdc_obd->obd_name); + } + } + rc = obd_fid_fini(tgt->ltd_exp->exp_obd); + if (rc) + CERROR("Can't finanize fids factory\n"); + + CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", + tgt->ltd_exp->exp_obd->obd_name, + tgt->ltd_exp->exp_obd->obd_uuid.uuid); + + obd_register_observer(tgt->ltd_exp->exp_obd, NULL); + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + if (tgt->ltd_active) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + } + } + + lmv_activate_target(lmv, tgt, 0); + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lmv_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct proc_dir_entry *lmv_proc_dir; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + int i; + ENTRY; + + if (!lmv->tgts) + goto out_local; + + /* + * Only disconnect the underlying layers on the final disconnect. + */ + lmv->refcount--; + if (lmv->refcount != 0) + goto out_local; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + lmv_disconnect_mdc(obd, lmv->tgts[i]); + } + + lmv_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); + if (lmv_proc_dir) { + lprocfs_remove(&lmv_proc_dir); + } else { + CERROR("/proc/fs/lustre/%s/%s/target_obds missing\n", + obd->obd_type->typ_name, obd->obd_name); + } + +out_local: + /* + * This is the case when no real connection is established by + * lmv_check_connect(). + */ + if (!lmv->connected) + class_export_put(exp); + rc = class_disconnect(exp); + if (lmv->refcount == 0) + lmv->connected = 0; + RETURN(rc); +} + +static int lmv_fid2path(struct obd_export *exp, int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + struct getinfo_fid2path *gf; + struct lmv_tgt_desc *tgt; + struct getinfo_fid2path *remote_gf = NULL; + int remote_gf_size = 0; + int rc; + + gf = (struct getinfo_fid2path *)karg; + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + +repeat_fid2path: + rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); + if (rc != 0 && rc != -EREMOTE) + GOTO(out_fid2path, rc); + + /* If remote_gf != NULL, it means just building the + * path on the remote MDT, copy this path segement to gf */ + if (remote_gf != NULL) { + struct getinfo_fid2path *ori_gf; + char *ptr; + + ori_gf = (struct getinfo_fid2path *)karg; + if (strlen(ori_gf->gf_path) + + strlen(gf->gf_path) > ori_gf->gf_pathlen) + GOTO(out_fid2path, rc = -EOVERFLOW); + + ptr = ori_gf->gf_path; + + memmove(ptr + strlen(gf->gf_path) + 1, ptr, + strlen(ori_gf->gf_path)); + + strncpy(ptr, gf->gf_path, strlen(gf->gf_path)); + ptr += strlen(gf->gf_path); + *ptr = '/'; + } + + CDEBUG(D_INFO, "%s: get path %s "DFID" rec: "LPU64" ln: %u\n", + tgt->ltd_exp->exp_obd->obd_name, + gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno, + gf->gf_linkno); + + if (rc == 0) + GOTO(out_fid2path, rc); + + /* sigh, has to go to another MDT to do path building further */ + if (remote_gf == NULL) { + remote_gf_size = sizeof(*remote_gf) + PATH_MAX; + OBD_ALLOC(remote_gf, remote_gf_size); + if (remote_gf == NULL) + GOTO(out_fid2path, rc = -ENOMEM); + remote_gf->gf_pathlen = PATH_MAX; + } + + if (!fid_is_sane(&gf->gf_fid)) { + CERROR("%s: invalid FID "DFID": rc = %d\n", + tgt->ltd_exp->exp_obd->obd_name, + PFID(&gf->gf_fid), -EINVAL); + GOTO(out_fid2path, rc = -EINVAL); + } + + tgt = lmv_find_target(lmv, &gf->gf_fid); + if (IS_ERR(tgt)) + GOTO(out_fid2path, rc = -EINVAL); + + remote_gf->gf_fid = gf->gf_fid; + remote_gf->gf_recno = -1; + remote_gf->gf_linkno = -1; + memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen); + gf = remote_gf; + goto repeat_fid2path; + +out_fid2path: + if (remote_gf != NULL) + OBD_FREE(remote_gf, remote_gf_size); + RETURN(rc); +} + +static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lmv_obd *lmv = &obddev->u.lmv; + int i = 0; + int rc = 0; + int set = 0; + int count = lmv->desc.ld_tgt_count; + ENTRY; + + if (count == 0) + RETURN(-ENOTTY); + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *mdc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if ((index >= count)) + RETURN(-ENODEV); + + if (lmv->tgts[index] == NULL || + lmv->tgts[index]->ltd_active == 0) + RETURN(-ENODATA); + + mdc_obd = class_exp2obd(lmv->tgts[index]->ltd_exp); + if (!mdc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + rc = obd_statfs(NULL, lmv->tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lmv_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lmv->tgts[qctl->qc_idx]; + if (tgt == NULL || tgt->ltd_exp == NULL) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lmv->tgts[i]; + if (tgt == NULL) + continue; + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case OBD_IOC_CHANGELOG_SEND: + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + + if (icc->icc_mdtindex >= count) + RETURN(-ENODEV); + + if (lmv->tgts[icc->icc_mdtindex] == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_exp == NULL || + lmv->tgts[icc->icc_mdtindex]->ltd_active == 0) + RETURN(-ENODEV); + rc = obd_iocontrol(cmd, lmv->tgts[icc->icc_mdtindex]->ltd_exp, + sizeof(*icc), icc, NULL); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + if (lmv->tgts[0] == NULL) + RETURN(-ENODATA); + rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg); + break; + } + case OBD_IOC_FID2PATH: { + rc = lmv_fid2path(exp, len, karg, uarg); + break; + } + case LL_IOC_HSM_STATE_GET: + case LL_IOC_HSM_STATE_SET: + case LL_IOC_HSM_ACTION: + case LL_IOC_LOV_SWAP_LAYOUTS: { + struct md_op_data *op_data = karg; + struct lmv_tgt_desc *tgt1, *tgt2; + + tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt1)) + RETURN(PTR_ERR(tgt1)); + + tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + if (IS_ERR(tgt2)) + RETURN(PTR_ERR(tgt2)); + + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) + RETURN(-EINVAL); + + /* only files on same MDT can have their layouts swapped */ + if (tgt1->ltd_idx != tgt2->ltd_idx) + RETURN(-EPERM); + + rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); + break; + } + default: + for (i = 0; i < count; i++) { + struct obd_device *mdc_obd; + int err; + + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL) + continue; + /* ll_umount_begin() sets force flag but for lmv, not + * mdc. Let's pass it through */ + mdc_obd = class_exp2obd(lmv->tgts[i]->ltd_exp); + mdc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, + karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + RETURN(err); + } else if (err) { + if (lmv->tgts[i]->ltd_active) { + CERROR("error: iocontrol MDC %s on MDT" + "idx %d cmd %x: err = %d\n", + lmv->tgts[i]->ltd_uuid.uuid, + i, cmd, err); + if (!rc) + rc = err; + } + } else + set = 1; + } + if (!set && !rc) + rc = -EIO; + } + RETURN(rc); +} + +#if 0 +static int lmv_all_chars_policy(int count, const char *name, + int len) +{ + unsigned int c = 0; + + while (len > 0) + c += name[--len]; + c = c % count; + return c; +} + +static int lmv_nid_policy(struct lmv_obd *lmv) +{ + struct obd_import *imp; + __u32 id; + + /* + * XXX: To get nid we assume that underlying obd device is mdc. + */ + imp = class_exp2cliimp(lmv->tgts[0].ltd_exp); + id = imp->imp_connection->c_self ^ (imp->imp_connection->c_self >> 32); + return id % lmv->desc.ld_tgt_count; +} + +static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + placement_policy_t placement) +{ + switch (placement) { + case PLACEMENT_CHAR_POLICY: + return lmv_all_chars_policy(lmv->desc.ld_tgt_count, + op_data->op_name, + op_data->op_namelen); + case PLACEMENT_NID_POLICY: + return lmv_nid_policy(lmv); + + default: + break; + } + + CERROR("Unsupported placement policy %x\n", placement); + return -EINVAL; +} +#endif + +/** + * This is _inode_ placement policy function (not name). + */ +static int lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data, + mdsno_t *mds) +{ + struct lmv_obd *lmv = &obd->u.lmv; + ENTRY; + + LASSERT(mds != NULL); + + if (lmv->desc.ld_tgt_count == 1) { + *mds = 0; + RETURN(0); + } + + /** + * If stripe_offset is provided during setdirstripe + * (setdirstripe -i xx), xx MDS will be choosen. + */ + if (op_data->op_cli_flags & CLI_SET_MEA) { + struct lmv_user_md *lum; + + lum = (struct lmv_user_md *)op_data->op_data; + if (lum->lum_type == LMV_STRIPE_TYPE && + lum->lum_stripe_offset != -1) { + if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) { + CERROR("%s: Stripe_offset %d > MDT count %d:" + " rc = %d\n", obd->obd_name, + lum->lum_stripe_offset, + lmv->desc.ld_tgt_count, -ERANGE); + RETURN(-ERANGE); + } + *mds = lum->lum_stripe_offset; + RETURN(0); + } + } + + /* Allocate new fid on target according to operation type and parent + * home mds. */ + *mds = op_data->op_mds; + RETURN(0); +} + +int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, + mdsno_t mds) +{ + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + tgt = lmv_get_target(lmv, mds); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * New seq alloc and FLD setup should be atomic. Otherwise we may find + * on server that seq in new allocated fid is not yet known. + */ + mutex_lock(&tgt->ltd_fid_mutex); + + if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) + GOTO(out, rc = -ENODEV); + + /* + * Asking underlaying tgt layer to allocate new fid. + */ + rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL); + if (rc > 0) { + LASSERT(fid_is_sane(fid)); + rc = 0; + } + + EXIT; +out: + mutex_unlock(&tgt->ltd_fid_mutex); + return rc; +} + +int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + mdsno_t mds = 0; + int rc; + ENTRY; + + LASSERT(op_data != NULL); + LASSERT(fid != NULL); + + rc = lmv_placement_policy(obd, op_data, &mds); + if (rc) { + CERROR("Can't get target for allocating fid, " + "rc %d\n", rc); + RETURN(rc); + } + + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + + RETURN(rc); +} + +static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lmv_obd *lmv = &obd->u.lmv; + struct lprocfs_static_vars lvars; + struct lmv_desc *desc; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LMV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("Lmv descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + OBD_ALLOC(lmv->tgts, sizeof(*lmv->tgts) * 32); + if (lmv->tgts == NULL) + RETURN(-ENOMEM); + lmv->tgts_size = 32; + + obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); + lmv->desc.ld_tgt_count = 0; + lmv->desc.ld_active_tgt_count = 0; + lmv->max_cookiesize = 0; + lmv->max_def_easize = 0; + lmv->max_easize = 0; + lmv->lmv_placement = PLACEMENT_CHAR_POLICY; + + spin_lock_init(&lmv->lmv_lock); + mutex_init(&lmv->init_mutex); + + lprocfs_lmv_init_vars(&lvars); + + lprocfs_obd_setup(obd, lvars.obd_vars); +#ifdef LPROCFS + { + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lmv_proc_target_fops, obd); + if (rc) + CWARN("%s: error adding LMV target_obd file: rc = %d\n", + obd->obd_name, rc); + } +#endif + rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, + LUSTRE_CLI_FLD_HASH_DHT); + if (rc) { + CERROR("Can't init FLD, err %d\n", rc); + GOTO(out, rc); + } + + RETURN(0); + +out: + return rc; +} + +static int lmv_cleanup(struct obd_device *obd) +{ + struct lmv_obd *lmv = &obd->u.lmv; + ENTRY; + + fld_client_fini(&lmv->lmv_fld); + if (lmv->tgts != NULL) { + int i; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL) + continue; + lmv_del_target(lmv, i); + } + OBD_FREE(lmv->tgts, sizeof(*lmv->tgts) * lmv->tgts_size); + lmv->tgts_size = 0; + } + RETURN(0); +} + +static int lmv_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct obd_uuid obd_uuid; + int gen; + __u32 index; + int rc; + ENTRY; + + switch (lcfg->lcfg_command) { + case LCFG_ADD_MDC: + /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID + * 2:0 3:1 4:lustre-MDT0000-mdc_UUID */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) + GOTO(out, rc = -EINVAL); + rc = lmv_add_target(obd, &obd_uuid, index, gen); + GOTO(out, rc); + default: + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + } +out: + RETURN(rc); +} + +static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct obd_statfs *temp; + int rc = 0; + int i; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + OBD_ALLOC(temp, sizeof(*temp)); + if (temp == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + + rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, + max_age, flags); + if (rc) { + CERROR("can't stat MDS #%d (%s), error %d\n", i, + lmv->tgts[i]->ltd_exp->exp_obd->obd_name, + rc); + GOTO(out_free_temp, rc); + } + + if (i == 0) { + *osfs = *temp; + /* If the statfs is from mount, it will needs + * retrieve necessary information from MDT0. + * i.e. mount does not need the merged osfs + * from all of MDT. + * And also clients can be mounted as long as + * MDT0 is in service*/ + if (flags & OBD_STATFS_FOR_MDT0) + GOTO(out_free_temp, rc); + } else { + osfs->os_bavail += temp->os_bavail; + osfs->os_blocks += temp->os_blocks; + osfs->os_ffree += temp->os_ffree; + osfs->os_files += temp->os_files; + } + } + + EXIT; +out_free_temp: + OBD_FREE(temp, sizeof(*temp)); + return rc; +} + +static int lmv_getstatus(struct obd_export *exp, + struct lu_fid *fid, + struct obd_capa **pc) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid, pc); + RETURN(rc); +} + +static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, obd_valid valid, const char *name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_getxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, request); + + RETURN(rc); +} + +static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, obd_valid valid, const char *name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, + input_size, output_size, flags, suppgid, + request); + + RETURN(rc); +} + +static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = tgt->ltd_idx; + RETURN(0); + } + + rc = md_getattr(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + md_null_inode(lmv->tgts[i]->ltd_exp, fid); + } + + RETURN(0); +} + +static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int i; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid)); + + /* + * With DNE every object can have two locks in different namespaces: + * lookup lock in space of MDT storing direntry and update/open lock in + * space of MDT storing inode. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL) + continue; + rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data); + if (rc) + RETURN(rc); + } + + RETURN(rc); +} + + +static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1)); + rc = md_close(tgt->ltd_exp, op_data, mod, request); + RETURN(rc); +} + +struct lmv_tgt_desc +*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, + struct lu_fid *fid) +{ + struct lmv_tgt_desc *tgt; + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + return tgt; + + op_data->op_mds = tgt->ltd_idx; + + return tgt; +} + +int lmv_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + if (!lmv->desc.ld_active_tgt_count) + RETURN(-EIO); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + op_data->op_mds); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, + cap_effective, rdev, request); + + if (rc == 0) { + if (*request == NULL) + RETURN(rc); + CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2)); + } + RETURN(rc); +} + +static int lmv_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_done_writing(tgt->ltd_exp, op_data, mod); + RETURN(rc); +} + +static int +lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + int extra_lock_flags) +{ + struct ptlrpc_request *req = it->d.lustre.it_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lustre_handle plock; + struct lmv_tgt_desc *tgt; + struct md_op_data *rdata; + struct lu_fid fid1; + struct mdt_body *body; + int rc = 0; + int pmode; + ENTRY; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (!(body->valid & OBD_MD_MDS)) + RETURN(0); + + CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1)); + + /* + * We got LOOKUP lock, but we really need attrs. + */ + pmode = it->d.lustre.it_lock_mode; + LASSERT(pmode != 0); + memcpy(&plock, lockh, sizeof(plock)); + it->d.lustre.it_lock_mode = 0; + it->d.lustre.it_data = NULL; + fid1 = body->fid1; + + it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE; + ptlrpc_req_finished(req); + + tgt = lmv_find_target(lmv, &fid1); + if (IS_ERR(tgt)) + GOTO(out, rc = PTR_ERR(tgt)); + + OBD_ALLOC_PTR(rdata); + if (rdata == NULL) + GOTO(out, rc = -ENOMEM); + + rdata->op_fid1 = fid1; + rdata->op_bias = MDS_CROSS_REF; + + rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh, + lmm, lmmsize, NULL, extra_lock_flags); + OBD_FREE_PTR(rdata); + EXIT; +out: + ldlm_lock_decref(&plock, pmode); + return rc; +} + +static int +lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n", + LL_IT2STR(it), PFID(&op_data->op_fid1)); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n", + LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx); + + rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + + if (rc == 0 && it && it->it_op == IT_OPEN) { + rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh, + lmm, lmmsize, extra_lock_flags); + } + RETURN(rc); +} + +static int +lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req = NULL; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + struct mdt_body *body; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + tgt->ltd_idx); + + rc = md_getattr_name(tgt->ltd_exp, op_data, request); + if (rc != 0) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, + &RMF_MDT_BODY); + LASSERT(body != NULL); + + if (body->valid & OBD_MD_MDS) { + struct lu_fid rid = body->fid1; + CDEBUG(D_INODE, "Request attrs for "DFID"\n", + PFID(&rid)); + + tgt = lmv_find_target(lmv, &rid); + if (IS_ERR(tgt)) { + ptlrpc_req_finished(*request); + RETURN(PTR_ERR(tgt)); + } + + op_data->op_fid1 = rid; + op_data->op_valid |= OBD_MD_FLCROSSREF; + op_data->op_namelen = 0; + op_data->op_name = NULL; + rc = md_getattr_name(tgt->ltd_exp, op_data, &req); + ptlrpc_req_finished(*request); + *request = req; + } + + RETURN(rc); +} + +#define md_op_data_fid(op_data, fl) \ + (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ + fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ + fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ + fl == MF_MDC_CANCEL_FID4 ? &op_data->op_fid4 : \ + NULL) + +static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data, + int op_tgt, ldlm_mode_t mode, int bits, int flag) +{ + struct lu_fid *fid = md_op_data_fid(op_data, flag); + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ldlm_policy_data_t policy = {{0}}; + int rc = 0; + ENTRY; + + if (!fid_is_sane(fid)) + RETURN(0); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + if (tgt->ltd_idx != op_tgt) { + CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); + policy.l_inodebits.bits = bits; + rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, + mode, LCF_ASYNC, NULL); + } else { + CDEBUG(D_INODE, + "EARLY_CANCEL skip operation target %d on "DFID"\n", + op_tgt, PFID(fid)); + op_data->op_flags |= flag; + rc = 0; + } + + RETURN(rc); +} + +/* + * llite passes fid of an target inode in op_data->op_fid1 and id of directory in + * op_data->op_fid2 + */ +static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + LASSERT(op_data->op_namelen != 0); + + CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n", + PFID(&op_data->op_fid2), op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid1)); + + op_data->op_fsuid = current_fsuid(); + op_data->op_fsgid = current_fsgid(); + op_data->op_cap = cfs_curproc_cap_pack(); + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + /* + * Cancel UPDATE lock on child (fid1). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID2; + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); + if (rc != 0) + RETURN(rc); + + rc = md_link(tgt->ltd_exp, op_data, request); + + RETURN(rc); +} + +static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *src_tgt; + struct lmv_tgt_desc *tgt_tgt; + int rc; + ENTRY; + + LASSERT(oldlen != 0); + + CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n", + oldlen, old, PFID(&op_data->op_fid1), + newlen, new, PFID(&op_data->op_fid2)); + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + op_data->op_fsuid = current_fsuid(); + op_data->op_fsgid = current_fsgid(); + op_data->op_cap = cfs_curproc_cap_pack(); + src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(src_tgt)) + RETURN(PTR_ERR(src_tgt)); + + tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + if (IS_ERR(tgt_tgt)) + RETURN(PTR_ERR(tgt_tgt)); + /* + * LOOKUP lock on src child (fid3) should also be cancelled for + * src_tgt in mdc_rename. + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its + * own target. + */ + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, + MF_MDC_CANCEL_FID2); + + /* + * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt. + */ + if (rc == 0) { + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_LOOKUP, + MF_MDC_CANCEL_FID4); + } + + /* + * Cancel all the locks on tgt child (fid4). + */ + if (rc == 0) + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_FULL, + MF_MDC_CANCEL_FID4); + + if (rc == 0) + rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen, + new, newlen, request); + RETURN(rc); +} + +static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc = 0; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n", + PFID(&op_data->op_fid1), op_data->op_attr.ia_valid); + + op_data->op_flags |= MF_MDC_CANCEL_FID1; + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2, + ea2len, request, mod); + + RETURN(rc); +} + +static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_sync(tgt->ltd_exp, fid, oc, request); + RETURN(rc); +} + +/* + * Adjust a set of pages, each page containing an array of lu_dirpages, + * so that each page can be used as a single logical lu_dirpage. + * + * A lu_dirpage is laid out as follows, where s = ldp_hash_start, + * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a + * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end + * value is used as a cookie to request the next lu_dirpage in a + * directory listing that spans multiple pages (two in this example): + * ________ + * | | + * .|--------v------- -----. + * |s|e|f|p|ent|ent| ... |ent| + * '--|-------------- -----' Each CFS_PAGE contains a single + * '------. lu_dirpage. + * .---------v------- -----. + * |s|e|f|p|ent| 0 | ... | 0 | + * '----------------- -----' + * + * However, on hosts where the native VM page size (PAGE_CACHE_SIZE) is + * larger than LU_PAGE_SIZE, a single host page may contain multiple + * lu_dirpages. After reading the lu_dirpages from the MDS, the + * ldp_hash_end of the first lu_dirpage refers to the one immediately + * after it in the same CFS_PAGE (arrows simplified for brevity, but + * in general e0==s1, e1==s2, etc.): + * + * .-------------------- -----. + * |s0|e0|f0|p|ent|ent| ... |ent| + * |---v---------------- -----| + * |s1|e1|f1|p|ent|ent| ... |ent| + * |---v---------------- -----| Here, each CFS_PAGE contains + * ... multiple lu_dirpages. + * |---v---------------- -----| + * |s'|e'|f'|p|ent|ent| ... |ent| + * '---|---------------- -----' + * v + * .----------------------------. + * | next CFS_PAGE | + * + * This structure is transformed into a single logical lu_dirpage as follows: + * + * - Replace e0 with e' so the request for the next lu_dirpage gets the page + * labeled 'next CFS_PAGE'. + * + * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether + * a hash collision with the next page exists. + * + * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span + * to the first entry of the next lu_dirpage. + */ +#if PAGE_CACHE_SIZE > LU_PAGE_SIZE +static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs) +{ + int i; + + for (i = 0; i < ncfspgs; i++) { + struct lu_dirpage *dp = kmap(pages[i]); + struct lu_dirpage *first = dp; + struct lu_dirent *end_dirent = NULL; + struct lu_dirent *ent; + __u64 hash_end = dp->ldp_hash_end; + __u32 flags = dp->ldp_flags; + + for (; nlupgs > 1; nlupgs--) { + ent = lu_dirent_start(dp); + for (end_dirent = ent; ent != NULL; + end_dirent = ent, ent = lu_dirent_next(ent)); + + /* Advance dp to next lu_dirpage. */ + dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); + + /* Check if we've reached the end of the CFS_PAGE. */ + if (!((unsigned long)dp & ~CFS_PAGE_MASK)) + break; + + /* Save the hash and flags of this lu_dirpage. */ + hash_end = dp->ldp_hash_end; + flags = dp->ldp_flags; + + /* Check if lu_dirpage contains no entries. */ + if (!end_dirent) + break; + + /* Enlarge the end entry lde_reclen from 0 to + * first entry of next lu_dirpage. */ + LASSERT(le16_to_cpu(end_dirent->lde_reclen) == 0); + end_dirent->lde_reclen = + cpu_to_le16((char *)(dp->ldp_entries) - + (char *)end_dirent); + } + + first->ldp_hash_end = hash_end; + first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); + first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); + + kunmap(pages[i]); + } +} +#else +#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0) +#endif /* PAGE_CACHE_SIZE > LU_PAGE_SIZE */ + +static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + __u64 offset = op_data->op_offset; + int rc; + int ncfspgs; /* pages read in PAGE_CACHE_SIZE */ + int nlupgs; /* pages read in LU_PAGE_SIZE */ + struct lmv_tgt_desc *tgt; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + CDEBUG(D_INODE, "READPAGE at "LPX64" from "DFID"\n", + offset, PFID(&op_data->op_fid1)); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_readpage(tgt->ltd_exp, op_data, pages, request); + if (rc != 0) + RETURN(rc); + + ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1) + >> PAGE_CACHE_SHIFT; + nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT; + LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); + LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages); + + CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs, + op_data->op_npages); + + lmv_adjust_dirpages(pages, ncfspgs, nlupgs); + + RETURN(rc); +} + +static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + struct mdt_body *body; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); +retry: + /* Send unlink requests to the MDT where the child is located */ + if (likely(!fid_is_zero(&op_data->op_fid2))) + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); + else + tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_fsuid = current_fsuid(); + op_data->op_fsgid = current_fsgid(); + op_data->op_cap = cfs_curproc_cap_pack(); + + /* + * If child's fid is given, cancel unused locks for it if it is from + * another export than parent. + * + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). + */ + op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; + + /* + * Cancel FULL locks on child (fid3). + */ + rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, + MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); + + if (rc != 0) + RETURN(rc); + + CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n", + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + + rc = md_unlink(tgt->ltd_exp, op_data, request); + if (rc != 0 && rc != -EREMOTE) + RETURN(rc); + + body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + /* Not cross-ref case, just get out of here. */ + if (likely(!(body->valid & OBD_MD_MDS))) + RETURN(0); + + CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n", + exp->exp_obd->obd_name, PFID(&body->fid1)); + + /* This is a remote object, try remote MDT, Note: it may + * try more than 1 time here, Considering following case + * /mnt/lustre is root on MDT0, remote1 is on MDT1 + * 1. Initially A does not know where remote1 is, it send + * unlink RPC to MDT0, MDT0 return -EREMOTE, it will + * resend unlink RPC to MDT1 (retry 1st time). + * + * 2. During the unlink RPC in flight, + * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 + * and create new remote1, but on MDT0 + * + * 3. MDT1 get unlink RPC(from A), then do remote lock on + * /mnt/lustre, then lookup get fid of remote1, and find + * it is remote dir again, and replay -EREMOTE again. + * + * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). + * + * In theory, it might try unlimited time here, but it should + * be very rare case. */ + op_data->op_fid2 = body->fid1; + ptlrpc_req_finished(*request); + *request = NULL; + + goto retry; +} + +static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + struct lmv_obd *lmv = &obd->u.lmv; + int rc = 0; + + switch (stage) { + case OBD_CLEANUP_EARLY: + /* XXX: here should be calling obd_precleanup() down to + * stack. */ + break; + case OBD_CLEANUP_EXPORTS: + fld_client_proc_fini(&lmv->lmv_fld); + lprocfs_obd_cleanup(obd); + break; + default: + break; + } + RETURN(rc); +} + +static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + lmv = &obd->u.lmv; + if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { + struct lmv_tgt_desc *tgt; + int i; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + LASSERT(*vallen == sizeof(__u32)); + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + /* + * All tgts should be connected when this gets called. + */ + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + if (!obd_get_info(env, tgt->ltd_exp, keylen, key, + vallen, val, NULL)) + RETURN(0); + } + RETURN(-EINVAL); + } else if (KEY_IS(KEY_MAX_EASIZE) || KEY_IS(KEY_CONN_DATA)) { + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + /* + * Forwarding this request to first MDS, it should know LOV + * desc. + */ + rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, + vallen, val, NULL); + if (!rc && KEY_IS(KEY_CONN_DATA)) + exp->exp_connect_data = *(struct obd_connect_data *)val; + RETURN(rc); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lmv->desc.ld_tgt_count; + RETURN(0); + } + + CDEBUG(D_IOCTL, "Invalid key\n"); + RETURN(-EINVAL); +} + +int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct lmv_tgt_desc *tgt; + struct obd_device *obd; + struct lmv_obd *lmv; + int rc = 0; + ENTRY; + + obd = class_exp2obd(exp); + if (obd == NULL) { + CDEBUG(D_IOCTL, "Invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + lmv = &obd->u.lmv; + + if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) { + int i, err = 0; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + if (err && rc == 0) + rc = err; + } + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_stripe_md *meap; + struct lmv_stripe_md *lsmp; + int mea_size; + int i; + ENTRY; + + mea_size = lmv_get_easize(lmv); + if (!lmmp) + RETURN(mea_size); + + if (*lmmp && !lsm) { + OBD_FREE_LARGE(*lmmp, mea_size); + *lmmp = NULL; + RETURN(0); + } + + if (*lmmp == NULL) { + OBD_ALLOC_LARGE(*lmmp, mea_size); + if (*lmmp == NULL) + RETURN(-ENOMEM); + } + + if (!lsm) + RETURN(mea_size); + + lsmp = (struct lmv_stripe_md *)lsm; + meap = (struct lmv_stripe_md *)*lmmp; + + if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR && + lsmp->mea_magic != MEA_MAGIC_ALL_CHARS) + RETURN(-EINVAL); + + meap->mea_magic = cpu_to_le32(lsmp->mea_magic); + meap->mea_count = cpu_to_le32(lsmp->mea_count); + meap->mea_master = cpu_to_le32(lsmp->mea_master); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + meap->mea_ids[i] = lsmp->mea_ids[i]; + fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]); + } + + RETURN(mea_size); +} + +int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_size) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_stripe_md **tmea = (struct lmv_stripe_md **)lsmp; + struct lmv_stripe_md *mea = (struct lmv_stripe_md *)lmm; + struct lmv_obd *lmv = &obd->u.lmv; + int mea_size; + int i; + __u32 magic; + ENTRY; + + mea_size = lmv_get_easize(lmv); + if (lsmp == NULL) + return mea_size; + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE_LARGE(*tmea, mea_size); + *lsmp = NULL; + RETURN(0); + } + + LASSERT(mea_size == lmm_size); + + OBD_ALLOC_LARGE(*tmea, mea_size); + if (*tmea == NULL) + RETURN(-ENOMEM); + + if (!lmm) + RETURN(mea_size); + + if (mea->mea_magic == MEA_MAGIC_LAST_CHAR || + mea->mea_magic == MEA_MAGIC_ALL_CHARS || + mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) + { + magic = le32_to_cpu(mea->mea_magic); + } else { + /* + * Old mea is not handled here. + */ + CERROR("Old not supportable EA is found\n"); + LBUG(); + } + + (*tmea)->mea_magic = magic; + (*tmea)->mea_count = le32_to_cpu(mea->mea_count); + (*tmea)->mea_master = le32_to_cpu(mea->mea_master); + + for (i = 0; i < (*tmea)->mea_count; i++) { + (*tmea)->mea_ids[i] = mea->mea_ids[i]; + fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]); + } + RETURN(mea_size); +} + +static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + ldlm_cancel_flags_t flags, void *opaque) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc = 0; + int err; + int i; + ENTRY; + + LASSERT(fid != NULL); + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) + continue; + + err = md_cancel_unused(lmv->tgts[i]->ltd_exp, fid, + policy, mode, flags, opaque); + if (!rc) + rc = err; + } + RETURN(rc); +} + +int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, + __u64 *bits) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + int rc; + ENTRY; + + rc = md_set_lock_data(lmv->tgts[0]->ltd_exp, lockh, data, bits); + RETURN(rc); +} + +ldlm_mode_t lmv_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + ldlm_mode_t rc; + int i; + ENTRY; + + CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); + + /* + * With CMD every object can have two locks in different namespaces: + * lookup lock in space of mds storing direntry and update/open lock in + * space of mds storing inode. Thus we check all targets, not only that + * one fid was created in. + */ + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + if (lmv->tgts[i] == NULL || + lmv->tgts[i]->ltd_exp == NULL || + lmv->tgts[i]->ltd_active == 0) + continue; + + rc = md_lock_match(lmv->tgts[i]->ltd_exp, flags, fid, + type, policy, mode, lockh); + if (rc) + RETURN(rc); + } + + RETURN(0); +} + +int lmv_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_get_lustre_md(lmv->tgts[0]->ltd_exp, req, dt_exp, md_exp, md); +} + +int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + ENTRY; + + if (md->mea) + obd_free_memmd(exp, (void *)&md->mea); + RETURN(md_free_lustre_md(lmv->tgts[0]->ltd_exp, md)); +} + +int lmv_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct ptlrpc_request *open_req) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ENTRY; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_set_open_replay_data(tgt->ltd_exp, och, open_req)); +} + +int lmv_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + ENTRY; + + tgt = lmv_find_target(lmv, &och->och_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + RETURN(md_clear_open_replay_data(tgt->ltd_exp, och)); +} + +static int lmv_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_get_remote_perm(tgt->ltd_exp, fid, oc, suppgid, request); + RETURN(rc); +} + +static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, &oc->c_capa.lc_fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_renew_capa(tgt->ltd_exp, oc, cb); + RETURN(rc); +} + +int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) +{ + struct lmv_obd *lmv = &exp->exp_obd->u.lmv; + + return md_unpack_capa(lmv->tgts[0]->ltd_exp, req, field, oc); +} + +int lmv_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = NULL; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, &op_data->op_fid1); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo); + RETURN(rc); +} + +int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; + + rc = lmv_check_connect(obd); + if (rc) + RETURN(rc); + + tgt = lmv_find_target(lmv, fid); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits); + RETURN(rc); +} + +/** + * For lmv, only need to send request to master MDT, and the master MDT will + * process with other slave MDTs. The only exception is Q_GETOQUOTA for which + * we directly fetch data from the slave MDTs. + */ +int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = lmv->tgts[0]; + int rc = 0, i; + __u64 curspace, curinodes; + ENTRY; + + if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + if (oqctl->qc_cmd != Q_GETOQUOTA) { + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); + } + + curspace = curinodes = 0; + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + + if (tgt == NULL || tgt->ltd_exp == NULL || tgt->ltd_active == 0) + continue; + if (!tgt->ltd_active) { + CDEBUG(D_HA, "mdt %d is inactive.\n", i); + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + CERROR("getquota on mdt %d failed. %d\n", i, err); + if (!rc) + rc = err; + } else { + curspace += oqctl->qc_dqblk.dqb_curspace; + curinodes += oqctl->qc_dqblk.dqb_curinodes; + } + } + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_curinodes = curinodes; + + RETURN(rc); +} + +int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i, rc = 0; + ENTRY; + + for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + int err; + tgt = lmv->tgts[i]; + if (tgt == NULL || tgt->ltd_exp == NULL || !tgt->ltd_active) { + CERROR("lmv idx %d inactive\n", i); + RETURN(-EIO); + } + + err = obd_quotacheck(tgt->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + + RETURN(rc); +} + +struct obd_ops lmv_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lmv_setup, + .o_cleanup = lmv_cleanup, + .o_precleanup = lmv_precleanup, + .o_process_config = lmv_process_config, + .o_connect = lmv_connect, + .o_disconnect = lmv_disconnect, + .o_statfs = lmv_statfs, + .o_get_info = lmv_get_info, + .o_set_info_async = lmv_set_info_async, + .o_packmd = lmv_packmd, + .o_unpackmd = lmv_unpackmd, + .o_notify = lmv_notify, + .o_get_uuid = lmv_get_uuid, + .o_iocontrol = lmv_iocontrol, + .o_quotacheck = lmv_quotacheck, + .o_quotactl = lmv_quotactl +}; + +struct md_ops lmv_md_ops = { + .m_getstatus = lmv_getstatus, + .m_null_inode = lmv_null_inode, + .m_find_cbdata = lmv_find_cbdata, + .m_close = lmv_close, + .m_create = lmv_create, + .m_done_writing = lmv_done_writing, + .m_enqueue = lmv_enqueue, + .m_getattr = lmv_getattr, + .m_getxattr = lmv_getxattr, + .m_getattr_name = lmv_getattr_name, + .m_intent_lock = lmv_intent_lock, + .m_link = lmv_link, + .m_rename = lmv_rename, + .m_setattr = lmv_setattr, + .m_setxattr = lmv_setxattr, + .m_sync = lmv_sync, + .m_readpage = lmv_readpage, + .m_unlink = lmv_unlink, + .m_init_ea_size = lmv_init_ea_size, + .m_cancel_unused = lmv_cancel_unused, + .m_set_lock_data = lmv_set_lock_data, + .m_lock_match = lmv_lock_match, + .m_get_lustre_md = lmv_get_lustre_md, + .m_free_lustre_md = lmv_free_lustre_md, + .m_set_open_replay_data = lmv_set_open_replay_data, + .m_clear_open_replay_data = lmv_clear_open_replay_data, + .m_renew_capa = lmv_renew_capa, + .m_unpack_capa = lmv_unpack_capa, + .m_get_remote_perm = lmv_get_remote_perm, + .m_intent_getattr_async = lmv_intent_getattr_async, + .m_revalidate_lock = lmv_revalidate_lock +}; + +int __init lmv_init(void) +{ + struct lprocfs_static_vars lvars; + int rc; + + lprocfs_lmv_init_vars(&lvars); + + rc = class_register_type(&lmv_obd_ops, &lmv_md_ops, + lvars.module_vars, LUSTRE_LMV_NAME, NULL); + return rc; +} + +static void lmv_exit(void) +{ + class_unregister_type(LUSTRE_LMV_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Metadata Volume OBD driver"); +MODULE_LICENSE("GPL"); + +module_init(lmv_init); +module_exit(lmv_exit); diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c new file mode 100644 index 000000000000..4bbe0241c93d --- /dev/null +++ b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c @@ -0,0 +1,239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include + +#ifndef LPROCFS +static struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +#else +static int lmv_rd_numobd(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_tgt_count); + +} + +static const char *placement_name[] = { + [PLACEMENT_CHAR_POLICY] = "CHAR", + [PLACEMENT_NID_POLICY] = "NID", + [PLACEMENT_INVAL_POLICY] = "INVAL" +}; + +static placement_policy_t placement_name2policy(char *name, int len) +{ + int i; + + for (i = 0; i < PLACEMENT_MAX_POLICY; i++) { + if (!strncmp(placement_name[i], name, len)) + return i; + } + return PLACEMENT_INVAL_POLICY; +} + +static const char *placement_policy2name(placement_policy_t placement) +{ + LASSERT(placement < PLACEMENT_MAX_POLICY); + return placement_name[placement]; +} + +static int lmv_rd_placement(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + *eof = 1; + return snprintf(page, count, "%s\n", + placement_policy2name(lmv->lmv_placement)); + +} + +#define MAX_POLICY_STRING_SIZE 64 + +static int lmv_wr_placement(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + char dummy[MAX_POLICY_STRING_SIZE + 1]; + int len = count; + placement_policy_t policy; + struct lmv_obd *lmv; + + if (copy_from_user(dummy, buffer, MAX_POLICY_STRING_SIZE)) + return -EFAULT; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + + if (len > MAX_POLICY_STRING_SIZE) + len = MAX_POLICY_STRING_SIZE; + + if (dummy[len - 1] == '\n') + len--; + dummy[len] = '\0'; + + policy = placement_name2policy(dummy, len); + if (policy != PLACEMENT_INVAL_POLICY) { + spin_lock(&lmv->lmv_lock); + lmv->lmv_placement = policy; + spin_unlock(&lmv->lmv_lock); + } else { + CERROR("Invalid placement policy \"%s\"!\n", dummy); + return -EINVAL; + } + return count; +} + +static int lmv_rd_activeobd(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct lmv_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lmv.desc; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_active_tgt_count); +} + +static int lmv_rd_desc_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*) data; + struct lmv_obd *lmv; + + LASSERT(dev != NULL); + lmv = &dev->u.lmv; + *eof = 1; + return snprintf(page, count, "%s\n", lmv->desc.ld_uuid.uuid); +} + +static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos]; +} + +static void lmv_tgt_seq_stop(struct seq_file *p, void *v) +{ + return; +} + +static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lmv_obd *lmv = &dev->u.lmv; + ++*pos; + return (*pos >= lmv->desc.ld_tgt_count) ? NULL : lmv->tgts[*pos]; +} + +static int lmv_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lmv_tgt_desc *tgt = v; + + if (tgt == NULL) + return 0; + return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_idx, + tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); +} + +struct seq_operations lmv_tgt_sops = { + .start = lmv_tgt_seq_start, + .stop = lmv_tgt_seq_stop, + .next = lmv_tgt_seq_next, + .show = lmv_tgt_seq_show, +}; + +static int lmv_target_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lmv_tgt_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = dp->data; + + return 0; +} + +struct lprocfs_vars lprocfs_lmv_obd_vars[] = { + { "numobd", lmv_rd_numobd, 0, 0 }, + { "placement", lmv_rd_placement, lmv_wr_placement, 0 }, + { "activeobd", lmv_rd_activeobd, 0, 0 }, + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "desc_uuid", lmv_rd_desc_uuid, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_lmv_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +struct file_operations lmv_proc_target_fops = { + .owner = THIS_MODULE, + .open = lmv_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* LPROCFS */ +void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lmv_module_vars; + lvars->obd_vars = lprocfs_lmv_obd_vars; +} diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile new file mode 100644 index 000000000000..67eaec29bef1 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_LUSTRE_FS) += lov.o +lov-y := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o \ + lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \ + lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \ + lovsub_lock.o lovsub_io.o lov_pool.o + + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h new file mode 100644 index 000000000000..28801b8b5fdf --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -0,0 +1,820 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +# include + +#include +#include +#include "lov_internal.h" + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - sub-lock keep a reference to its top-locks. Top-lock keeps a + * reference (and a hold, see cl_lock_hold()) on its sub-locks when it + * actively using them (that is, in cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When + * moving into cl_lock_state::CLS_CACHED state, top-lock releases a + * hold. From this moment top-lock has only a 'weak' reference to its + * sub-locks. This reference is protected by top-lock + * cl_lock::cll_guard, and will be automatically cleared by the sub-lock + * when the latter is destroyed. When a sub-lock is canceled, a + * reference to it is removed from the top-lock array, and top-lock is + * moved into CLS_NEW state. It is guaranteed that all sub-locks exist + * while their top-lock is in CLS_HELD or CLS_CACHED states. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has vvp and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; +struct lovsub_lock; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. + */ + +/** + * Resources that are used in memory-cleaning path, and whose allocation + * cannot fail even when memory is tight. They are preallocated in sufficient + * quantities in lov_device::ld_emerg[], and access to them is serialized + * lov_device::ld_mutex. + */ +struct lov_device_emerg { + /** + * Page list used to submit IO when memory is in pressure. + */ + struct cl_page_list emrg_page_list; + /** + * sub-io's shared by all threads accessing this device when memory is + * too low to allocate sub-io's dynamically. + */ + struct cl_io emrg_subio; + /** + * Environments used by sub-io's in + * lov_device_emerg::emrg_subio. + */ + struct lu_env *emrg_env; + /** + * Refchecks for lov_device_emerg::emrg_env. + * + * \see cl_env_get() + */ + int emrg_refcheck; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /** Emergency resources used in memory-cleansing paths. */ + struct lov_device_emerg **ld_emrg; + /** + * Serializes access to lov_device::ld_emrg in low-memory + * conditions. + */ + struct mutex ld_mutex; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + /** empty file without body */ + LLT_EMPTY, + /** striped file */ + LLT_RAID0, + LLT_NR +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + /** + * True if layout is invalid. This bit is cleared when layout lock + * is lost. + */ + bool lo_layout_invalid; + /** + * How many IOs are on going on this object. Layout can be changed + * only if there is no active IO. + */ + atomic_t lo_active_ios; + /** + * Waitq - wait for no one else is using lo_lsm + */ + wait_queue_head_t lo_waitq; + /** + * Layout metadata. NULL if empty layout. + */ + struct lov_stripe_md *lo_lsm; + + union lov_layout_state { + struct lov_layout_raid0 { + unsigned lo_nr; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + int lo_attr_valid; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * protect lo_sub + */ + spinlock_t lo_sub_lock; + /** + * Cached object attribute, built from sub-object + * attributes. + */ + struct cl_attr lo_attr; + } raid0; + struct lov_layout_state_empty { + } empty; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + task_t *lo_owner; +}; + +/** + * Flags that top-lock can set on each of its sub-locks. + */ +enum lov_sub_flags { + /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */ + LSF_HELD = 1 << 0 +}; + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct lovsub_lock *sub_lock; + /** An array of per-sub-lock flags, taken from enum lov_sub_flags */ + unsigned sub_flags; + int sub_stripe; + struct cl_lock_descr sub_descr; + struct cl_lock_descr sub_got; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** + * Number of existing sub-locks. + */ + unsigned lls_nr_filled; + /** + * Set when sub-lock was canceled, while top-lock was being + * used, or unused. + */ + unsigned int lls_cancel_race:1; + /** + * An array of sub-locks + * + * There are two issues with managing sub-locks: + * + * - sub-locks are concurrently canceled, and + * + * - sub-locks are shared with other top-locks. + * + * To manage cancellation, top-lock acquires a hold on a sublock + * (lov_sublock_adopt()) when the latter is inserted into + * lov_lock::lls_sub[]. This hold is released (lov_sublock_release()) + * when top-lock is going into CLS_CACHED state or destroyed. Hold + * prevents sub-lock from cancellation. + * + * Sub-lock sharing means, among other things, that top-lock that is + * in the process of creation (i.e., not yet inserted into lock list) + * is already accessible to other threads once at least one of its + * sub-locks is created, see lov_lock_sub_init(). + * + * Sub-lock can be in one of the following states: + * + * - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such + * sub-lock was either never created (top-lock is in CLS_NEW + * state), or it was created, then canceled, then destroyed + * (lov_lock_unlink() cleared sub-lock pointer in the top-lock). + * + * - sub-lock exists and is on + * hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a + * normal state of a sub-lock in CLS_HELD and CLS_CACHED states + * of a top-lock. + * + * - sub-lock exists, but is not held by the top-lock. This + * happens after top-lock released a hold on sub-locks before + * going into cache (lov_lock_unuse()). + * + * \todo To support wide-striping, array has to be replaced with a set + * of queues to avoid scanning. + */ + struct lov_lock_sub *lls_sub; + /** + * Original description with which lock was enqueued. + */ + struct cl_lock_descr lls_orig; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + int lps_invalid; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct lov_device *acid_super; + int acid_idx; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * A link between a top-lock and a sub-lock. Separate data-structure is + * necessary, because top-locks and sub-locks are in M:N relationship. + * + * \todo This can be optimized for a (by far) most frequent case of a single + * top-lock per sub-lock. + */ +struct lov_lock_link { + struct lov_lock *lll_super; + /** An index within parent lock. */ + int lll_idx; + /** + * A linkage into per sub-lock list of all corresponding top-locks, + * hanging off lovsub_lock::lss_parents. + */ + struct list_head lll_list; +}; + +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; + /** + * List of top-locks that have given sub-lock as their part. Protected + * by cl_lock::cll_guard mutex. + */ + struct list_head lss_parents; + /** + * Top-lock that initiated current operation on this sub-lock. This is + * only set during top-to-bottom lock operations like enqueue, and is + * used to optimize state change notification. Protected by + * cl_lock::cll_guard mutex. + * + * \see lovsub_lock_state_one(). + */ + struct cl_lock *lss_active; +}; + +/** + * Describe the environment settings for sublocks. + */ +struct lov_sublock_env { + const struct lu_env *lse_env; + struct cl_io *lse_io; + struct lov_io_sub *lse_sub; +}; + +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct cl_lock_descr lti_ldescr; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + struct cl_lock_closure lti_closure; + wait_queue_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + int sub_stripe; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io *sub_io; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + /** + * true, iff cl_io_init() was successfully executed against + * lov_io_sub::sub_io. + */ + int sub_io_initialized; + /** + * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't + * allocated, but borrowed from a per-device emergency pool. + */ + int sub_borrowed; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + int sub_refcheck; + int sub_refcheck2; + int sub_reenter; + void *sub_cookie; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + obd_off lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + obd_off lis_endpos; + + int lis_mem_frozen; + int lis_stripe_count; + int lis_active_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct cl_io lis_single_subio; + + /** + * size of ls_subios array, actually the highest stripe # + */ + int lis_nr_subios; + struct lov_io_sub *lis_subs; + /** + * List of active sub-io's. + */ + struct list_head lis_active; +}; + +struct lov_session { + struct lov_io ls_io; + struct lov_sublock_env ls_subenv; +}; + +/** + * State of transfer for lov. + */ +struct lov_req { + struct cl_req_slice lr_cl; +}; + +/** + * State of transfer for lovsub. + */ +struct lovsub_req { + struct cl_req_slice lsrq_cl; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern struct kmem_cache *lov_lock_kmem; +extern struct kmem_cache *lov_object_kmem; +extern struct kmem_cache *lov_thread_kmem; +extern struct kmem_cache *lov_session_kmem; +extern struct kmem_cache *lov_req_kmem; + +extern struct kmem_cache *lovsub_lock_kmem; +extern struct kmem_cache *lovsub_object_kmem; +extern struct kmem_cache *lovsub_req_kmem; + +extern struct kmem_cache *lov_lock_link_kmem; + +int lov_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); + +int lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_lock_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_raid0 (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +void lov_lock_unlink (const struct lu_env *env, struct lov_lock_link *link, + struct lovsub_lock *sub); + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, + int stripe); +void lov_sub_put (struct lov_io_sub *sub); +int lov_sublock_modify (const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx); + + +int lov_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); +int lovsub_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, struct page *vmpage); + +int lov_page_init_empty (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +int lov_page_init_raid0 (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, struct page *vmpage); +struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub); +struct lov_io_sub *lov_page_subio (const struct lu_env *env, + struct lov_io *lio, + const struct cl_page_slice *slice); + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm); +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. + * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lov_req, lr_cl); +} + +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + +static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lovsub_req, lsrq_cl); +} + +static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice) +{ + return slice->cpl_page->cp_child; +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return info; +} + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) +{ + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC || + lov->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC_V3); + return &lov->u.raid0; +} + +/** @} lov */ + +#endif diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c new file mode 100644 index 000000000000..f94f8d9d33d7 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_dev.c @@ -0,0 +1,533 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +struct kmem_cache *lov_lock_kmem; +struct kmem_cache *lov_object_kmem; +struct kmem_cache *lov_thread_kmem; +struct kmem_cache *lov_session_kmem; +struct kmem_cache *lov_req_kmem; + +struct kmem_cache *lovsub_lock_kmem; +struct kmem_cache *lovsub_object_kmem; +struct kmem_cache *lovsub_req_kmem; + +struct kmem_cache *lov_lock_link_kmem; + +/** Lock class of lov_device::ld_mutex. */ +struct lock_class_key cl_lov_device_mutex_class; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof (struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof (struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof (struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof (struct lov_session) + }, + { + .ckd_cache = &lov_req_kmem, + .ckd_name = "lov_req_kmem", + .ckd_size = sizeof (struct lov_req) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof (struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof (struct lovsub_object) + }, + { + .ckd_cache = &lovsub_req_kmem, + .ckd_name = "lovsub_req_kmem", + .ckd_size = sizeof (struct lovsub_req) + }, + { + .ckd_cache = &lov_lock_link_kmem, + .ckd_name = "lov_lock_link_kmem", + .ckd_size = sizeof (struct lov_lock_link) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov transfer operations. + * + */ + +static void lov_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lov_req *lr; + + ENTRY; + lr = cl2lov_req(slice); + OBD_SLAB_FREE_PTR(lr, lov_req_kmem); + EXIT; +} + +static const struct cl_req_operations lov_req_ops = { + .cro_completion = lov_req_completion +}; + +/***************************************************************************** + * + * Lov device and device type functions. + * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_thread_kmem, __GFP_IO); + if (info != NULL) + INIT_LIST_HEAD(&info->lti_closure.clc_list); + else + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + LINVRNT(list_empty(&info->lti_closure.clc_list)); + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, lov_session_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + int i; + struct lov_device *ld = lu2lov_dev(d); + + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + RETURN(NULL); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + RETURN(rc); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc == NULL) + continue; + + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_obd->obd_lu_dev); + if (IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = i; + lsd->acid_super = ld; + ld->ld_target[i] = lsd; + } + + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; + + RETURN(rc); +} + +static int lov_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lov_req *lr; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lr, lov_req_kmem, __GFP_IO); + if (lr != NULL) { + cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +static const struct cl_device_operations lov_cl_ops = { + .cdo_req_init = lov_req_init +}; + +static void lov_emerg_free(struct lov_device_emerg **emrg, int nr) +{ + int i; + + for (i = 0; i < nr; ++i) { + struct lov_device_emerg *em; + + em = emrg[i]; + if (em != NULL) { + LASSERT(em->emrg_page_list.pl_nr == 0); + if (em->emrg_env != NULL) + cl_env_put(em->emrg_env, &em->emrg_refcheck); + OBD_FREE_PTR(em); + } + } + OBD_FREE(emrg, nr * sizeof emrg[0]); +} + +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = ld->ld_target_nr; + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target != NULL) + OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + if (ld->ld_emrg != NULL) + lov_emerg_free(ld->ld_emrg, nr); + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + ENTRY; + + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static struct lov_device_emerg **lov_emerg_alloc(int nr) +{ + struct lov_device_emerg **emerg; + int i; + int result; + + OBD_ALLOC(emerg, nr * sizeof emerg[0]); + if (emerg == NULL) + return ERR_PTR(-ENOMEM); + for (result = i = 0; i < nr && result == 0; i++) { + struct lov_device_emerg *em; + + OBD_ALLOC_PTR(em); + if (em != NULL) { + emerg[i] = em; + cl_page_list_init(&em->emrg_page_list); + em->emrg_env = cl_env_alloc(&em->emrg_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (!IS_ERR(em->emrg_env)) + em->emrg_env->le_ctx.lc_cookie = 0x2; + else { + result = PTR_ERR(em->emrg_env); + em->emrg_env = NULL; + } + } else + result = -ENOMEM; + } + if (result != 0) { + lov_emerg_free(emerg, nr); + emerg = ERR_PTR(result); + } + return emerg; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + struct lov_device_emerg **emerg; + const size_t sz = sizeof newd[0]; + + emerg = lov_emerg_alloc(tgt_size); + if (IS_ERR(emerg)) + RETURN(PTR_ERR(emerg)); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd != NULL) { + mutex_lock(&dev->ld_mutex); + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + + if (dev->ld_emrg != NULL) + lov_emerg_free(dev->ld_emrg, sub_size); + dev->ld_emrg = emerg; + mutex_unlock(&dev->ld_mutex); + } else { + lov_emerg_free(emerg, tgt_size); + result = -ENOMEM; + } + } + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + ENTRY; + + obd_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + LASSERT(tgt->ltd_obd != NULL); + + if (!tgt->ltd_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); + RETURN(-EINVAL); + } + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); + + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + tgt->ltd_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = index; + lsd->acid_super = ld; + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + obd_putref(obd); + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + obd_getref(obd); + + cmd = cfg->lcfg_command; + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc == 0) { + switch(cmd) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, 0, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + obd_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (ld == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + ld->ld_cl.cd_ops = &lov_cl_ops; + + mutex_init(&ld->ld_mutex); + lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class); + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + + ld->ld_lov = &obd->u.lov; + RETURN(d); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; +EXPORT_SYMBOL(lov_device_type); + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c new file mode 100644 index 000000000000..481e8631fc3e --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_ea.c @@ -0,0 +1,334 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_ea.c + * + * Author: Wang Di + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include + +#include +#include +#include + +#include "lov_internal.h" + +struct lovea_unpack_args { + struct lov_stripe_md *lsm; + int cursor; +}; + +static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, + __u16 stripe_count) +{ + + if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CERROR("bad stripe count %d\n", stripe_count); + lov_dump_lmm(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm_oi_id(&lmm->lmm_oi) == 0) { + CERROR("zero object id\n"); + lov_dump_lmm(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) { + CERROR("bad striping pattern\n"); + lov_dump_lmm(D_WARNING, lmm); + return -EINVAL; + } + + if (lmm->lmm_stripe_size == 0 || + (le32_to_cpu(lmm->lmm_stripe_size)&(LOV_MIN_STRIPE_SIZE-1)) != 0) { + CERROR("bad stripe size %u\n", + le32_to_cpu(lmm->lmm_stripe_size)); + lov_dump_lmm(D_WARNING, lmm); + return -EINVAL; + } + return 0; +} + +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size) +{ + struct lov_stripe_md *lsm; + struct lov_oinfo *loi; + int i, oinfo_ptrs_size; + + LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT); + + oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count; + *size = sizeof(struct lov_stripe_md) + oinfo_ptrs_size; + + OBD_ALLOC_LARGE(lsm, *size); + if (!lsm) + return NULL;; + + for (i = 0; i < stripe_count; i++) { + OBD_SLAB_ALLOC_PTR_GFP(loi, lov_oinfo_slab, __GFP_IO); + if (loi == NULL) + goto err; + lsm->lsm_oinfo[i] = loi; + } + lsm->lsm_stripe_count = stripe_count; + return lsm; + +err: + while (--i >= 0) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, sizeof(*loi)); + OBD_FREE_LARGE(lsm, *size); + return NULL; +} + +void lsm_free_plain(struct lov_stripe_md *lsm) +{ + __u16 stripe_count = lsm->lsm_stripe_count; + int i; + + for (i = 0; i < stripe_count; i++) + OBD_SLAB_FREE(lsm->lsm_oinfo[i], lov_oinfo_slab, + sizeof(struct lov_oinfo)); + OBD_FREE_LARGE(lsm, sizeof(struct lov_stripe_md) + + stripe_count * sizeof(struct lov_oinfo *)); +} + +static void lsm_unpackmd_common(struct lov_stripe_md *lsm, + struct lov_mds_md *lmm) +{ + /* + * This supposes lov_mds_md_v1/v3 first fields are + * are the same + */ + lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); + lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); + lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); + lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); + lsm->lsm_pool_name[0] = '\0'; +} + +static void +lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno, + obd_off *lov_off, obd_off *swidth) +{ + if (swidth) + *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static void +lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno, + obd_off *lov_off, obd_off *swidth) +{ + if (swidth) + *swidth = (obd_off)lsm->lsm_stripe_size * lsm->lsm_stripe_count; +} + +static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa, + struct obd_export *md_exp) +{ + return 0; +} + +/* Find minimum stripe maxbytes value. For inactive or + * reconnecting targets use LUSTRE_STRIPE_MAXBYTES. */ +static void lov_tgt_maxbytes(struct lov_tgt_desc *tgt, __u64 *stripe_maxbytes) +{ + struct obd_import *imp = tgt->ltd_obd->u.cli.cl_import; + + if (imp == NULL || !tgt->ltd_active) { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + return; + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && + imp->imp_connect_data.ocd_maxbytes > 0) { + if (*stripe_maxbytes > imp->imp_connect_data.ocd_maxbytes) + *stripe_maxbytes = imp->imp_connect_data.ocd_maxbytes; + } else { + *stripe_maxbytes = LUSTRE_STRIPE_MAXBYTES; + } + spin_unlock(&imp->imp_lock); +} + +static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, + __u16 *stripe_count) +{ + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v1 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) { + CERROR("LOV EA V1 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count); +} + +int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md_v1 *lmm) +{ + struct lov_oinfo *loi; + int i; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + + lsm_unpackmd_common(lsm, lmm); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v1(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + + return 0; +} + +const struct lsm_operations lsm_v1_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v1, + .lsm_unpackmd = lsm_unpackmd_v1, +}; + +static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes, + __u16 *stripe_count) +{ + struct lov_mds_md_v3 *lmm; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v3 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) { + CERROR("LOV EA V3 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes, + *stripe_count); +} + +int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmv1) +{ + struct lov_mds_md_v3 *lmm; + struct lov_oinfo *loi; + int i; + __u64 stripe_maxbytes = OBD_OBJECT_EOF; + int cplen = 0; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm); + cplen = strlcpy(lsm->lsm_pool_name, lmm->lmm_pool_name, + sizeof(lsm->lsm_pool_name)); + if (cplen >= sizeof(lsm->lsm_pool_name)) + return -E2BIG; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + ostid_le_to_cpu(&lmm->lmm_objects[i].l_ost_oi, &loi->loi_oi); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + /* calculate the minimum stripe max bytes */ + lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx], + &stripe_maxbytes); + } + + lsm->lsm_maxbytes = stripe_maxbytes * lsm->lsm_stripe_count; + + return 0; +} + +const struct lsm_operations lsm_v3_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v3, + .lsm_unpackmd = lsm_unpackmd_v3, +}; diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h new file mode 100644 index 000000000000..146d5e310283 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_internal.h @@ -0,0 +1,322 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef LOV_INTERNAL_H +#define LOV_INTERNAL_H + +#include +#include +#include + +struct lov_lock_handles { + struct portals_handle llh_handle; + atomic_t llh_refcount; + int llh_stripe_count; + struct lustre_handle llh_handles[0]; +}; + +struct lov_request { + struct obd_info rq_oi; + struct lov_request_set *rq_rqset; + + struct list_head rq_link; + + int rq_idx; /* index in lov->tgts array */ + int rq_stripe; /* stripe number */ + int rq_complete; + int rq_rc; + int rq_buflen; /* length of sub_md */ + + obd_count rq_oabufs; + obd_count rq_pgaidx; +}; + +struct lov_request_set { + struct ldlm_enqueue_info *set_ei; + struct obd_info *set_oi; + atomic_t set_refcount; + struct obd_export *set_exp; + /* XXX: There is @set_exp already, however obd_statfs gets obd_device + only. */ + struct obd_device *set_obd; + int set_count; + atomic_t set_completes; + atomic_t set_success; + atomic_t set_finish_checked; + struct llog_cookie *set_cookies; + int set_cookie_sent; + struct obd_trans_info *set_oti; + obd_count set_oabufs; + struct brw_page *set_pga; + struct lov_lock_handles *set_lockh; + struct list_head set_list; + wait_queue_head_t set_waitq; + spinlock_t set_lock; +}; + +extern struct kmem_cache *lov_oinfo_slab; + +void lov_finish_set(struct lov_request_set *set); + +static inline void lov_get_reqset(struct lov_request_set *set) +{ + LASSERT(set != NULL); + LASSERT(atomic_read(&set->set_refcount) > 0); + atomic_inc(&set->set_refcount); +} + +static inline void lov_put_reqset(struct lov_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + lov_finish_set(set); +} + +static inline struct lov_lock_handles * +lov_handle2llh(struct lustre_handle *handle) +{ + LASSERT(handle != NULL); + return(class_handle2object(handle->cookie)); +} + +static inline void lov_llh_put(struct lov_lock_handles *llh) +{ + CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount) - 1); + LASSERT(atomic_read(&llh->llh_refcount) > 0 && + atomic_read(&llh->llh_refcount) < 0x5a5a); + if (atomic_dec_and_test(&llh->llh_refcount)) { + class_handle_unhash(&llh->llh_handle); + /* The structure may be held by other threads because RCU. + * -jxiong */ + if (atomic_read(&llh->llh_refcount)) + return; + + OBD_FREE_RCU(llh, sizeof *llh + + sizeof(*llh->llh_handles) * llh->llh_stripe_count, + &llh->llh_handle); + } +} + +#define lov_uuid2str(lv, index) \ + (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) + +/* lov_merge.c */ +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid, + struct lov_stripe_md *lsm, int stripeno, int *set); +int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only); +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + obd_off size, int shrink); +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place); + +/* lov_offset.c */ +obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, + int stripeno); +int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, + int stripeno, obd_off *obd_off); +obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size, + int stripeno); +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + obd_off start, obd_off end, + obd_off *obd_start, obd_off *obd_end); +int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off); + +/* lov_qos.c */ +#define LOV_USES_ASSIGNED_STRIPE 0 +#define LOV_USES_DEFAULT_STRIPE 1 +int qos_add_tgt(struct obd_device *obd, __u32 index); +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); +void qos_shrink_lsm(struct lov_request_set *set); +int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); +void qos_update(struct lov_obd *lov); +void qos_statfs_done(struct lov_obd *lov); +void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait); +int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); + +/* lov_request.c */ +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set); +int lov_set_finished(struct lov_request_set *set, int idempotent); +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx); +int lov_prep_create_set(struct obd_export *exp, struct obd_info *oifo, + struct lov_stripe_md **ea, struct obdo *src_oa, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int cb_create_update(void *cookie, int rc); +int lov_fini_create_set(struct lov_request_set *set, struct lov_stripe_md **ea); +int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_fini_brw_set(struct lov_request_set *set); +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_request_set **reqset); +int lov_fini_getattr_set(struct lov_request_set *set); +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_update_destroy_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_fini_destroy_set(struct lov_request_set *set); +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc); +int lov_fini_setattr_set(struct lov_request_set *set); +int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset); +int lov_fini_punch_set(struct lov_request_set *set); +int lov_prep_sync_set(struct obd_export *exp, struct obd_info *obd_info, + obd_off start, obd_off end, + struct lov_request_set **reqset); +int lov_fini_sync_set(struct lov_request_set *set); +int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct lov_request_set **reqset); +int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc, + struct ptlrpc_request_set *rqset); +int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, + ldlm_policy_data_t *policy, __u32 mode, + struct lustre_handle *lockh, + struct lov_request_set **reqset); +int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags); +int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, + __u32 mode, struct lustre_handle *lockh, + struct lov_request_set **reqset); +int lov_fini_cancel_set(struct lov_request_set *set); +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset); +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success); +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, + int success); +int lov_fini_statfs_set(struct lov_request_set *set); +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc); + +/* lov_obd.c */ +void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count); +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); +/* lov_log.c */ +int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *tgt, int *idx); +int lov_llog_finish(struct obd_device *obd, int count); + +/* lov_pack.c */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmm, + struct lov_stripe_md *lsm); +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes); +int lov_setstripe(struct obd_export *exp, int max_lmm_size, + struct lov_stripe_md **lsmp, struct lov_user_md *lump); +int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_user_md *lump); +int lov_getstripe(struct obd_export *exp, + struct lov_stripe_md *lsm, struct lov_user_md *lump); +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic); +int lov_free_memmd(struct lov_stripe_md **lsmp); + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm(int level, void *lmm); + +/* lov_ea.c */ +struct lov_stripe_md *lsm_alloc_plain(__u16 stripe_count, int *size); +void lsm_free_plain(struct lov_stripe_md *lsm); + +int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct obdo *oa, void *data); +/* lproc_lov.c */ +extern struct file_operations lov_proc_target_fops; +#ifdef LPROCFS +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + +/* pools */ +extern cfs_hash_ops_t pool_hash_operations; +/* ost_pool methods */ +int lov_ost_pool_init(struct ost_pool *op, unsigned int count); +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); +int lov_ost_pool_free(struct ost_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); +void lov_pool_putref(struct pool_desc *pool); + +static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) +{ + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + atomic_inc(&lsm->lsm_refc); + return lsm; +} + +#endif diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c new file mode 100644 index 000000000000..1a87abdf0953 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_io.c @@ -0,0 +1,967 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static inline void lov_sub_enter(struct lov_io_sub *sub) +{ + sub->sub_reenter++; +} +static inline void lov_sub_exit(struct lov_io_sub *sub) +{ + sub->sub_reenter--; +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + if (sub->sub_io != NULL) { + if (sub->sub_io_initialized) { + lov_sub_enter(sub); + cl_io_fini(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + sub->sub_io_initialized = 0; + lio->lis_active_subios--; + } + if (sub->sub_stripe == lio->lis_single_subio_index) + lio->lis_single_subio_index = -1; + else if (!sub->sub_borrowed) + OBD_FREE_PTR(sub->sub_io); + sub->sub_io = NULL; + } + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (!sub->sub_borrowed) + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, + int stripe, loff_t start, loff_t end) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + + switch(io->ci_type) { + case CIT_SETATTR: { + io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; + io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; + io->u.ci_setattr.sa_capa = parent->u.ci_setattr.sa_capa; + if (cl_io_is_trunc(io)) { + loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; + + new_size = lov_size_to_stripe(lsm, new_size, stripe); + io->u.ci_setattr.sa_attr.lvb_size = new_size; + } + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_FSYNC: { + io->u.ci_fsync.fi_start = start; + io->u.ci_fsync.fi_end = end; + io->u.ci_fsync.fi_capa = parent->u.ci_fsync.fi_capa; + io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; + io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); + if (cl_io_is_append(parent)) { + io->u.ci_wr.wr_append = 1; + } else { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + } + break; + } + default: + break; + } +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev); + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + + int stripe = sub->sub_stripe; + int result; + + LASSERT(sub->sub_io == NULL); + LASSERT(sub->sub_env == NULL); + LASSERT(sub->sub_stripe < lio->lis_stripe_count); + ENTRY; + + result = 0; + sub->sub_io_initialized = 0; + sub->sub_borrowed = 0; + + if (lio->lis_mem_frozen) { + LASSERT(mutex_is_locked(&ld->ld_mutex)); + sub->sub_io = &ld->ld_emrg[stripe]->emrg_subio; + sub->sub_env = ld->ld_emrg[stripe]->emrg_env; + sub->sub_borrowed = 1; + } else { + void *cookie; + + /* obtain new environment */ + cookie = cl_env_reenter(); + sub->sub_env = cl_env_get(&sub->sub_refcheck); + cl_env_reexit(cookie); + if (IS_ERR(sub->sub_env)) + result = PTR_ERR(sub->sub_env); + + if (result == 0) { + /* + * First sub-io. Use ->lis_single_subio to + * avoid dynamic allocation. + */ + if (lio->lis_active_subios == 0) { + sub->sub_io = &lio->lis_single_subio; + lio->lis_single_subio_index = stripe; + } else { + OBD_ALLOC_PTR(sub->sub_io); + if (sub->sub_io == NULL) + result = -ENOMEM; + } + } + } + + if (result == 0) { + sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]); + sub_io = sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + sub_io->ci_no_srvlock = io->ci_no_srvlock; + + lov_sub_enter(sub); + result = cl_io_sub_init(sub->sub_env, sub_io, + io->ci_type, sub_obj); + lov_sub_exit(sub); + if (result >= 0) { + lio->lis_active_subios++; + sub->sub_io_initialized = 1; + result = 0; + } + } + if (result != 0) + lov_io_sub_fini(env, lio, sub); + RETURN(result); +} + +struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int stripe) +{ + int rc; + struct lov_io_sub *sub = &lio->lis_subs[stripe]; + + LASSERT(stripe < lio->lis_stripe_count); + ENTRY; + + if (!sub->sub_io_initialized) { + sub->sub_stripe = stripe; + rc = lov_io_sub_init(env, lio, sub); + } else + rc = 0; + if (rc == 0) + lov_sub_enter(sub); + else + sub = ERR_PTR(rc); + RETURN(sub); +} + +void lov_sub_put(struct lov_io_sub *sub) +{ + lov_sub_exit(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +static int lov_page_stripe(const struct cl_page *page) +{ + struct lovsub_object *subobj; + + ENTRY; + subobj = lu2lovsub( + lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header, + &lovsub_device_type)); + LASSERT(subobj != NULL); + RETURN(subobj->lso_index); +} + +struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, + const struct cl_page_slice *slice) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct cl_page *page = slice->cpl_page; + int stripe; + + LASSERT(lio->lis_cl.cis_io != NULL); + LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object); + LASSERT(lsm != NULL); + LASSERT(lio->lis_nr_subios > 0); + ENTRY; + + stripe = lov_page_stripe(page); + RETURN(lov_sub_get(env, lio, stripe)); +} + + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + int result; + + LASSERT(lio->lis_object != NULL); + ENTRY; + + /* + * Need to be optimized, we can't afford to allocate a piece of memory + * when writing a page. -jay + */ + OBD_ALLOC_LARGE(lio->lis_subs, + lsm->lsm_stripe_count * sizeof lio->lis_subs[0]); + if (lio->lis_subs != NULL) { + lio->lis_nr_subios = lio->lis_stripe_count; + lio->lis_single_subio_index = -1; + lio->lis_active_subios = 0; + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +static void lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + ENTRY; + + io->ci_result = 0; + lio->lis_object = obj; + + LASSERT(obj->lo_lsm != NULL); + lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_SETATTR: + if (cl_io_is_trunc(io)) + lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; + else + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_FSYNC: { + lio->lis_pos = io->u.ci_fsync.fi_start; + lio->lis_endpos = io->u.ci_fsync.fi_end; + break; + } + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + EXIT; +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *lov = cl2lov(ios->cis_obj); + int i; + + ENTRY; + if (lio->lis_subs != NULL) { + for (i = 0; i < lio->lis_nr_subios; i++) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + OBD_FREE_LARGE(lio->lis_subs, + lio->lis_nr_subios * sizeof lio->lis_subs[0]); + lio->lis_nr_subios = 0; + } + + LASSERT(atomic_read(&lov->lo_active_ios) > 0); + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static obd_off lov_offset_mod(obd_off val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + struct lov_io_sub *sub; + obd_off endpos; + obd_off start; + obd_off end; + int stripe; + int rc = 0; + + ENTRY; + endpos = lov_offset_mod(lio->lis_endpos, -1); + for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) { + if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos, + endpos, &start, &end)) + continue; + + end = lov_offset_mod(end, +1); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + lov_io_sub_inherit(sub->sub_io, lio, stripe, + start, end); + rc = cl_io_iter_init(sub->sub_env, sub->sub_io); + lov_sub_put(sub); + CDEBUG(D_VFSTRACE, "shrink: %d ["LPU64", "LPU64")\n", + stripe, start, end); + } else + rc = PTR_ERR(sub); + + if (!rc) + list_add_tail(&sub->sub_linkage, &lio->lis_active); + else + break; + } + RETURN(rc); +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; + loff_t start = io->u.ci_rw.crw_pos; + loff_t next; + unsigned long ssize = lsm->lsm_stripe_size; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + /* fast path for common case. */ + if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) { + + lov_do_div64(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = ~0ull; + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos, + next) - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, "stripe: "LPU64" chunk: ["LPU64", "LPU64") " + LPU64"\n", (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos); + } + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct cl_io *parent = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + lov_sub_enter(sub); + rc = iofunc(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + if (rc) + break; + + if (parent->ci_result == 0) + parent->ci_result = sub->sub_io->ci_result; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + + +static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld, + struct cl_page_list *qin, + int idx, int alloc) +{ + return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list; +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. + */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *obj = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev); + struct cl_page_list *qin = &queue->c2_qin; + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + struct cl_page_list *stripes_qin = NULL; + struct cl_page *page; + struct cl_page *tmp; + int stripe; + +#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc) + + int rc = 0; + int alloc = + !(current->flags & PF_MEMALLOC); + ENTRY; + if (lio->lis_active_subios == 1) { + int idx = lio->lis_single_subio_index; + struct lov_io_sub *sub; + + LASSERT(idx < lio->lis_nr_subios); + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub->sub_io == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, queue); + lov_sub_put(sub); + RETURN(rc); + } + + LASSERT(lio->lis_subs != NULL); + if (alloc) { + OBD_ALLOC_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + if (stripes_qin == NULL) + RETURN(-ENOMEM); + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) + cl_page_list_init(&stripes_qin[stripe]); + } else { + /* + * If we get here, it means pageout & swap doesn't help. + * In order to not make things worse, even don't try to + * allocate the memory with __GFP_NOWARN. -jay + */ + mutex_lock(&ld->ld_mutex); + lio->lis_mem_frozen = 1; + } + + cl_2queue_init(cl2q); + cl_page_list_for_each_safe(page, tmp, qin) { + stripe = lov_page_stripe(page); + cl_page_list_move(QIN(stripe), qin, page); + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct lov_io_sub *sub; + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, &cl2q->c2_qin); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, cl2q); + lov_sub_put(sub); + } else + rc = PTR_ERR(sub); + cl_page_list_splice(&cl2q->c2_qin, &queue->c2_qin); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + if (rc != 0) + break; + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, qin); + } + + if (alloc) { + OBD_FREE_LARGE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + } else { + int i; + + for (i = 0; i < lio->lis_nr_subios; i++) { + struct cl_io *cio = lio->lis_subs[i].sub_io; + + if (cio && cio == &ld->ld_emrg[i]->emrg_subio) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + } + lio->lis_mem_frozen = 0; + mutex_unlock(&ld->ld_mutex); + } + + RETURN(rc); +#undef QIN +} + +static int lov_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + ENTRY; + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_prepare_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + RETURN(result); +} + +static int lov_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + ENTRY; + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_commit_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + RETURN(result); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + ENTRY; + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page)); + sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob; + lov_sub_put(sub); + RETURN(lov_io_start(env, ios)); +} + +static void lov_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_io_sub *sub; + unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; + ENTRY; + + *written = 0; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + struct cl_io *subio = sub->sub_io; + + lov_sub_enter(sub); + lov_io_end_wrapper(sub->sub_env, subio); + lov_sub_exit(sub); + + if (subio->ci_result == 0) + *written += subio->u.ci_fsync.fi_nr_written; + } + RETURN_EXIT; +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_SETATTR] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_FSYNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_fsync_end + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = lov_io_submit + }, + [CRT_WRITE] = { + .cio_submit = lov_io_submit + } + }, + .cio_prepare_write = lov_io_prepare_write, + .cio_commit_write = lov_io_commit_write +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_object *lov = cl2lov(ios->cis_obj); + ENTRY; + + if (atomic_dec_and_test(&lov->lo_active_ios)) + wake_up_all(&lov->lo_waitq); + EXIT; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. + */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_empty_io_fini, +#if 0 + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_SETATTR] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FSYNC] = { + .cio_fini = lov_empty_io_fini + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + }, + [CRT_WRITE] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + } + }, + .cio_commit_write = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + + ENTRY; + INIT_LIST_HEAD(&lio->lis_active); + lov_io_slice_init(lio, lov, io); + if (io->ci_result == 0) { + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + atomic_inc(&lov->lo_active_ios); + } + } + RETURN(io->ci_result); +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_io *lio = lov_env_io(env); + int result; + ENTRY; + + lio->lis_object = lov; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_READ: + result = 0; + break; + case CIT_FSYNC: + case CIT_SETATTR: + result = +1; + break; + case CIT_WRITE: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) { + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + atomic_inc(&lov->lo_active_ios); + } + + io->ci_result = result < 0 ? result : 0; + RETURN(result != 0); +} + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c new file mode 100644 index 000000000000..bdf3334e0c9f --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_lock.c @@ -0,0 +1,1253 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent); + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice); +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, + struct cl_lock *parent, + struct lov_lock_sub *lls) +{ + struct lov_sublock_env *subenv; + struct lov_io *lio = lov_env_io(env); + struct cl_io *io = lio->lis_cl.cis_io; + struct lov_io_sub *sub; + + subenv = &lov_env_session(env)->ls_subenv; + + /* + * FIXME: We tend to use the subio's env & io to call the sublock + * lock operations because osc lock sometimes stores some control + * variables in thread's IO infomation(Now only lockless information). + * However, if the lock's host(object) is different from the object + * for current IO, we have no way to get the subenv and subio because + * they are not initialized at all. As a temp fix, in this case, + * we still borrow the parent's env to call sublock operations. + */ + if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { + subenv->lse_env = env; + subenv->lse_io = io; + subenv->lse_sub = NULL; + } else { + sub = lov_sub_get(env, lio, lls->sub_stripe); + if (!IS_ERR(sub)) { + subenv->lse_env = sub->sub_env; + subenv->lse_io = sub->sub_io; + subenv->lse_sub = sub; + } else { + subenv = (void*)sub; + } + } + return subenv; +} + +static void lov_sublock_env_put(struct lov_sublock_env *subenv) +{ + if (subenv && subenv->lse_sub) + lov_sub_put(subenv->lse_sub); +} + +static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, int idx, + struct lov_lock_link *link) +{ + struct lovsub_lock *lsl; + struct cl_lock *parent = lck->lls_cl.cls_lock; + int rc; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sublock)); + ENTRY; + + lsl = cl2sub_lock(sublock); + /* + * check that sub-lock doesn't have lock link to this top-lock. + */ + LASSERT(lov_lock_link_find(env, lck, lsl) == NULL); + LASSERT(idx < lck->lls_nr); + + lck->lls_sub[idx].sub_lock = lsl; + lck->lls_nr_filled++; + LASSERT(lck->lls_nr_filled <= lck->lls_nr); + list_add_tail(&link->lll_list, &lsl->lss_parents); + link->lll_idx = idx; + link->lll_super = lck; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lov-child", sublock); + lck->lls_sub[idx].sub_flags |= LSF_HELD; + cl_lock_user_add(env, sublock); + + rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx); + LASSERT(rc == 0); /* there is no way this can fail, currently */ + EXIT; +} + +static struct cl_lock *lov_sublock_alloc(const struct lu_env *env, + const struct cl_io *io, + struct lov_lock *lck, + int idx, struct lov_lock_link **out) +{ + struct cl_lock *sublock; + struct cl_lock *parent; + struct lov_lock_link *link; + + LASSERT(idx < lck->lls_nr); + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(link, lov_lock_link_kmem, __GFP_IO); + if (link != NULL) { + struct lov_sublock_env *subenv; + struct lov_lock_sub *lls; + struct cl_lock_descr *descr; + + parent = lck->lls_cl.cls_lock; + lls = &lck->lls_sub[idx]; + descr = &lls->sub_got; + + subenv = lov_sublock_env_get(env, parent, lls); + if (!IS_ERR(subenv)) { + /* CAVEAT: Don't try to add a field in lov_lock_sub + * to remember the subio. This is because lock is able + * to be cached, but this is not true for IO. This + * further means a sublock might be referenced in + * different io context. -jay */ + + sublock = cl_lock_hold(subenv->lse_env, subenv->lse_io, + descr, "lov-parent", parent); + lov_sublock_env_put(subenv); + } else { + /* error occurs. */ + sublock = (void*)subenv; + } + + if (!IS_ERR(sublock)) + *out = link; + else + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + } else + sublock = ERR_PTR(-ENOMEM); + RETURN(sublock); +} + +static void lov_sublock_unlock(const struct lu_env *env, + struct lovsub_lock *lsl, + struct cl_lock_closure *closure, + struct lov_sublock_env *subenv) +{ + ENTRY; + lov_sublock_env_put(subenv); + lsl->lss_active = NULL; + cl_lock_disclosure(env, closure); + EXIT; +} + +static int lov_sublock_lock(const struct lu_env *env, + struct lov_lock *lck, + struct lov_lock_sub *lls, + struct cl_lock_closure *closure, + struct lov_sublock_env **lsep) +{ + struct lovsub_lock *sublock; + struct cl_lock *child; + int result = 0; + ENTRY; + + LASSERT(list_empty(&closure->clc_list)); + + sublock = lls->sub_lock; + child = sublock->lss_cl.cls_lock; + result = cl_lock_closure_build(env, child, closure); + if (result == 0) { + struct cl_lock *parent = closure->clc_origin; + + LASSERT(cl_lock_is_mutexed(child)); + sublock->lss_active = parent; + + if (unlikely((child->cll_state == CLS_FREEING) || + (child->cll_flags & CLF_CANCELLED))) { + struct lov_lock_link *link; + /* + * we could race with lock deletion which temporarily + * put the lock in freeing state, bug 19080. + */ + LASSERT(!(lls->sub_flags & LSF_HELD)); + + link = lov_lock_link_find(env, lck, sublock); + LASSERT(link != NULL); + lov_lock_unlink(env, link, sublock); + lov_sublock_unlock(env, sublock, closure, NULL); + lck->lls_cancel_race = 1; + result = CLO_REPEAT; + } else if (lsep) { + struct lov_sublock_env *subenv; + subenv = lov_sublock_env_get(env, parent, lls); + if (IS_ERR(subenv)) { + lov_sublock_unlock(env, sublock, + closure, NULL); + result = PTR_ERR(subenv); + } else { + *lsep = subenv; + } + } + } + RETURN(result); +} + +/** + * Updates the result of a top-lock operation from a result of sub-lock + * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate + * over sub-locks and lov_subresult() is used to calculate return value of a + * top-operation. To this end, possible return values of sub-operations are + * ordered as + * + * - 0 success + * - CLO_WAIT wait for event + * - CLO_REPEAT repeat top-operation + * - -ne fundamental error + * + * Top-level return code can only go down through this list. CLO_REPEAT + * overwrites CLO_WAIT, because lock mutex was released and sleeping condition + * has to be rechecked by the upper layer. + */ +static int lov_subresult(int result, int rc) +{ + int result_rank; + int rc_rank; + + ENTRY; + + LASSERTF(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT, + "result = %d", result); + LASSERTF(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT, + "rc = %d\n", rc); + CLASSERT(CLO_WAIT < CLO_REPEAT); + + /* calculate ranks in the ordering above */ + result_rank = result < 0 ? 1 + CLO_REPEAT : result; + rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc; + + if (result_rank < rc_rank) + result = rc; + RETURN(result); +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). + */ +static int lov_lock_sub_init(const struct lu_env *env, + struct lov_lock *lck, const struct cl_io *io) +{ + int result = 0; + int i; + int nr; + obd_off start; + obd_off end; + obd_off file_start; + obd_off file_end; + + struct lov_object *loo = cl2lov(lck->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct cl_lock *parent = lck->lls_cl.cls_lock; + + ENTRY; + + lck->lls_orig = parent->cll_descr; + file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start); + file_end = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1; + + for (i = 0, nr = 0; i < r0->lo_nr; i++) { + /* + * XXX for wide striping smarter algorithm is desirable, + * breaking out of the loop, early. + */ + if (lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) + nr++; + } + LASSERT(nr > 0); + OBD_ALLOC_LARGE(lck->lls_sub, nr * sizeof lck->lls_sub[0]); + if (lck->lls_sub == NULL) + RETURN(-ENOMEM); + + lck->lls_nr = nr; + /* + * First, fill in sub-lock descriptions in + * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc() + * (called below in this function, and by lov_lock_enqueue()) to + * create sub-locks. At this moment, no other thread can access + * top-lock. + */ + for (i = 0, nr = 0; i < r0->lo_nr; ++i) { + if (lov_stripe_intersects(loo->lo_lsm, i, + file_start, file_end, &start, &end)) { + struct cl_lock_descr *descr; + + descr = &lck->lls_sub[nr].sub_descr; + + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[i]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = parent->cll_descr.cld_mode; + descr->cld_gid = parent->cll_descr.cld_gid; + descr->cld_enq_flags = parent->cll_descr.cld_enq_flags; + /* XXX has no effect */ + lck->lls_sub[nr].sub_got = *descr; + lck->lls_sub[nr].sub_stripe = i; + nr++; + } + } + LASSERT(nr == lck->lls_nr); + /* + * Then, create sub-locks. Once at least one sub-lock was created, + * top-lock can be reached by other threads. + */ + for (i = 0; i < lck->lls_nr; ++i) { + struct cl_lock *sublock; + struct lov_lock_link *link; + + if (lck->lls_sub[i].sub_lock == NULL) { + sublock = lov_sublock_alloc(env, io, lck, i, &link); + if (IS_ERR(sublock)) { + result = PTR_ERR(sublock); + break; + } + cl_lock_get_trust(sublock); + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + /* + * recheck under mutex that sub-lock wasn't created + * concurrently, and that top-lock is still alive. + */ + if (lck->lls_sub[i].sub_lock == NULL && + parent->cll_state < CLS_FREEING) { + lov_sublock_adopt(env, lck, sublock, i, link); + cl_lock_mutex_put(env, parent); + } else { + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, + "lov-parent", parent); + } + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + } + } + /* + * Some sub-locks can be missing at this point. This is not a problem, + * because enqueue will create them anyway. Main duty of this function + * is to fill in sub-lock descriptions in a race free manner. + */ + RETURN(result); +} + +static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck, + int i, int deluser, int rc) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + ENTRY; + + if (lck->lls_sub[i].sub_flags & LSF_HELD) { + struct cl_lock *sublock; + int dying; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + + lck->lls_sub[i].sub_flags &= ~LSF_HELD; + if (deluser) + cl_lock_user_del(env, sublock); + /* + * If the last hold is released, and cancellation is pending + * for a sub-lock, release parent mutex, to avoid keeping it + * while sub-lock is being paged out. + */ + dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM || + sublock->cll_descr.cld_mode == CLM_GROUP || + (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) && + sublock->cll_holds == 1; + if (dying) + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + if (dying) { + cl_lock_mutex_get(env, parent); + rc = lov_subresult(rc, CLO_REPEAT); + } + /* + * From now on lck->lls_sub[i].sub_lock is a "weak" pointer, + * not backed by a reference on a + * sub-lock. lovsub_lock_delete() will clear + * lck->lls_sub[i].sub_lock under semaphores, just before + * sub-lock is destroyed. + */ + } + RETURN(rc); +} + +static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck, + int i) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + ENTRY; + + if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) { + struct cl_lock *sublock; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + LASSERT(sublock->cll_state != CLS_FREEING); + + lck->lls_sub[i].sub_flags |= LSF_HELD; + + cl_lock_get_trust(sublock); + cl_lock_hold_add(env, sublock, "lov-parent", parent); + cl_lock_user_add(env, sublock); + cl_lock_put(env, sublock); + } + EXIT; +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck; + int i; + + ENTRY; + lck = cl2lov_lock(slice); + LASSERT(lck->lls_nr_filled == 0); + if (lck->lls_sub != NULL) { + for (i = 0; i < lck->lls_nr; ++i) + /* + * No sub-locks exists at this point, as sub-lock has + * a reference on its parent. + */ + LASSERT(lck->lls_sub[i].sub_lock == NULL); + OBD_FREE_LARGE(lck->lls_sub, + lck->lls_nr * sizeof lck->lls_sub[0]); + } + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); + EXIT; +} + +static int lov_lock_enqueue_wait(const struct lu_env *env, + struct lov_lock *lck, + struct cl_lock *sublock) +{ + struct cl_lock *lock = lck->lls_cl.cls_lock; + int result; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + + cl_lock_mutex_put(env, lock); + result = cl_lock_enqueue_wait(env, sublock, 0); + cl_lock_mutex_get(env, lock); + RETURN(result ?: CLO_REPEAT); +} + +/** + * Tries to advance a state machine of a given sub-lock toward enqueuing of + * the top-lock. + * + * \retval 0 if state-transition can proceed + * \retval -ve otherwise. + */ +static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, + struct cl_io *io, __u32 enqflags, int last) +{ + int result; + ENTRY; + + /* first, try to enqueue a sub-lock ... */ + result = cl_enqueue_try(env, sublock, io, enqflags); + if ((sublock->cll_state == CLS_ENQUEUED) && !(enqflags & CEF_AGL)) { + /* if it is enqueued, try to `wait' on it---maybe it's already + * granted */ + result = cl_wait_try(env, sublock); + if (result == CLO_REENQUEUED) + result = CLO_WAIT; + } + /* + * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in + * parallel, otherwise---enqueue has to wait until sub-lock is granted + * before proceeding to the next one. + */ + if ((result == CLO_WAIT) && (sublock->cll_state <= CLS_HELD) && + (enqflags & CEF_ASYNC) && (!last || (enqflags & CEF_AGL))) + result = 0; + RETURN(result); +} + +/** + * Helper function for lov_lock_enqueue() that creates missing sub-lock. + */ +static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent, + struct cl_io *io, struct lov_lock *lck, int idx) +{ + struct lov_lock_link *link; + struct cl_lock *sublock; + int result; + + LASSERT(parent->cll_depth == 1); + cl_lock_mutex_put(env, parent); + sublock = lov_sublock_alloc(env, io, lck, idx, &link); + if (!IS_ERR(sublock)) + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + + if (!IS_ERR(sublock)) { + cl_lock_get_trust(sublock); + if (parent->cll_state == CLS_QUEUING && + lck->lls_sub[idx].sub_lock == NULL) { + lov_sublock_adopt(env, lck, sublock, idx, link); + } else { + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + /* other thread allocated sub-lock, or enqueue is no + * longer going on */ + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + cl_lock_mutex_get(env, parent); + } + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + result = CLO_REPEAT; + } else + result = PTR_ERR(sublock); + return result; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. + */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, lock); + int i; + int result; + enum cl_lock_state minstate; + + ENTRY; + + for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct lov_lock_sub *lls; + struct cl_lock *sublock; + struct lov_sublock_env *subenv; + + if (lock->cll_state != CLS_QUEUING) { + /* + * Lock might have left QUEUING state if previous + * iteration released its mutex. Stop enqueing in this + * case and let the upper layer to decide what to do. + */ + LASSERT(i > 0 && result != 0); + break; + } + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + if (sub == NULL) { + result = lov_sublock_fill(env, lock, io, lck, i); + /* lov_sublock_fill() released @lock mutex, + * restart. */ + break; + } + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + lov_sublock_hold(env, lck, i); + rc = lov_lock_enqueue_one(subenv->lse_env, lck, sublock, + subenv->lse_io, enqflags, + i == lck->lls_nr - 1); + minstate = min(minstate, sublock->cll_state); + if (rc == CLO_WAIT) { + switch (sublock->cll_state) { + case CLS_QUEUING: + /* take recursive mutex, the lock is + * released in lov_lock_enqueue_wait. + */ + cl_lock_mutex_get(env, sublock); + lov_sublock_unlock(env, sub, closure, + subenv); + rc = lov_lock_enqueue_wait(env, lck, + sublock); + break; + case CLS_CACHED: + cl_lock_get(sublock); + /* take recursive mutex of sublock */ + cl_lock_mutex_get(env, sublock); + /* need to release all locks in closure + * otherwise it may deadlock. LU-2683.*/ + lov_sublock_unlock(env, sub, closure, + subenv); + /* sublock and parent are held. */ + rc = lov_sublock_release(env, lck, i, + 1, rc); + cl_lock_mutex_put(env, sublock); + cl_lock_put(env, sublock); + break; + default: + lov_sublock_unlock(env, sub, closure, + subenv); + break; + } + } else { + LASSERT(sublock->cll_conflict == NULL); + lov_sublock_unlock(env, sub, closure, subenv); + } + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + cl_lock_closure_fini(closure); + RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT); +} + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + ENTRY; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. */ + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (lls->sub_flags & LSF_HELD) { + LASSERT(sublock->cll_state == CLS_HELD || + sublock->cll_state == CLS_ENQUEUED); + rc = cl_unuse_try(subenv->lse_env, sublock); + rc = lov_sublock_release(env, lck, i, 0, rc); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + } + + if (result == 0 && lck->lls_cancel_race) { + lck->lls_cancel_race = 0; + result = -ESTALE; + } + cl_lock_closure_fini(closure); + RETURN(result); +} + + +static void lov_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + ENTRY; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. */ + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + if (!(lls->sub_flags & LSF_HELD)) { + lov_sublock_unlock(env, sub, closure, subenv); + continue; + } + + switch(sublock->cll_state) { + case CLS_HELD: + rc = cl_unuse_try(subenv->lse_env, sublock); + lov_sublock_release(env, lck, i, 0, 0); + break; + default: + lov_sublock_release(env, lck, i, 1, 0); + break; + } + lov_sublock_unlock(env, sub, closure, subenv); + } + + if (rc == CLO_REPEAT) { + --i; + continue; + } + + result = lov_subresult(result, rc); + } + + if (result) + CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, + "lov_lock_cancel fails with %d.\n", result); + + cl_lock_closure_fini(closure); +} + +static int lov_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + enum cl_lock_state minstate; + int reenqueued; + int result; + int i; + + ENTRY; + +again: + for (result = 0, minstate = CLS_FREEING, i = 0, reenqueued = 0; + i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + LASSERT(sub != NULL); + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state >= CLS_ENQUEUED); + if (sublock->cll_state < CLS_HELD) + rc = cl_wait_try(env, sublock); + + minstate = min(minstate, sublock->cll_state); + lov_sublock_unlock(env, sub, closure, subenv); + } + if (rc == CLO_REENQUEUED) { + reenqueued++; + rc = 0; + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + /* Each sublock only can be reenqueued once, so will not loop for + * ever. */ + if (result == 0 && reenqueued != 0) + goto again; + cl_lock_closure_fini(closure); + RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT); +} + +static int lov_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int result; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + ENTRY; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + struct lov_lock_sub *lls; + struct lov_sublock_env *subenv; + + LASSERT(slice->cls_lock->cll_state == CLS_INTRANSIT); + + lls = &lck->lls_sub[i]; + sub = lls->sub_lock; + if (sub == NULL) { + /* + * Sub-lock might have been canceled, while top-lock was + * cached. + */ + result = -ESTALE; + break; + } + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lck, lls, closure, &subenv); + if (rc == 0) { + LASSERT(sublock->cll_state != CLS_FREEING); + lov_sublock_hold(env, lck, i); + if (sublock->cll_state == CLS_CACHED) { + rc = cl_use_try(subenv->lse_env, sublock, 0); + if (rc != 0) + rc = lov_sublock_release(env, lck, + i, 1, rc); + } else if (sublock->cll_state == CLS_NEW) { + /* Sub-lock might have been canceled, while + * top-lock was cached. */ + result = -ESTALE; + lov_sublock_release(env, lck, i, 1, result); + } + lov_sublock_unlock(env, sub, closure, subenv); + } + result = lov_subresult(result, rc); + if (result != 0) + break; + } + + if (lck->lls_cancel_race) { + /* + * If there is unlocking happened at the same time, then + * sublock_lock state should be FREEING, and lov_sublock_lock + * should return CLO_REPEAT. In this case, it should return + * ESTALE, and up layer should reset the lock state to be NEW. + */ + lck->lls_cancel_race = 0; + LASSERT(result != 0); + result = -ESTALE; + } + cl_lock_closure_fini(closure); + RETURN(result); +} + +#if 0 +static int lock_lock_multi_match() +{ + struct cl_lock *lock = slice->cls_lock; + struct cl_lock_descr *subneed = &lov_env_info(env)->lti_ldescr; + struct lov_object *loo = cl2lov(lov->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_lock_sub *sub; + struct cl_object *subobj; + obd_off fstart; + obd_off fend; + obd_off start; + obd_off end; + int i; + + fstart = cl_offset(need->cld_obj, need->cld_start); + fend = cl_offset(need->cld_obj, need->cld_end + 1) - 1; + subneed->cld_mode = need->cld_mode; + cl_lock_mutex_get(env, lock); + for (i = 0; i < lov->lls_nr; ++i) { + sub = &lov->lls_sub[i]; + if (sub->sub_lock == NULL) + continue; + subobj = sub->sub_descr.cld_obj; + if (!lov_stripe_intersects(loo->lo_lsm, sub->sub_stripe, + fstart, fend, &start, &end)) + continue; + subneed->cld_start = cl_index(subobj, start); + subneed->cld_end = cl_index(subobj, end); + subneed->cld_obj = subobj; + if (!cl_lock_ext_match(&sub->sub_got, subneed)) { + result = 0; + break; + } + } + cl_lock_mutex_put(env, lock); +} +#endif + +/** + * Check if the extent region \a descr is covered by \a child against the + * specific \a stripe. + */ +static int lov_lock_stripe_is_matching(const struct lu_env *env, + struct lov_object *lov, int stripe, + const struct cl_lock_descr *child, + const struct cl_lock_descr *descr) +{ + struct lov_stripe_md *lsm = lov->lo_lsm; + obd_off start; + obd_off end; + int result; + + if (lov_r0(lov)->lo_nr == 1) + return cl_lock_ext_match(child, descr); + + /* + * For a multi-stripes object: + * - make sure the descr only covers child's stripe, and + * - check if extent is matching. + */ + start = cl_offset(&lov->lo_cl, descr->cld_start); + end = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1; + result = end - start <= lsm->lsm_stripe_size && + stripe == lov_stripe_number(lsm, start) && + stripe == lov_stripe_number(lsm, end); + if (result) { + struct cl_lock_descr *subd = &lov_env_info(env)->lti_ldescr; + obd_off sub_start; + obd_off sub_end; + + subd->cld_obj = NULL; /* don't need sub object at all */ + subd->cld_mode = descr->cld_mode; + subd->cld_gid = descr->cld_gid; + result = lov_stripe_intersects(lsm, stripe, start, end, + &sub_start, &sub_end); + LASSERT(result); + subd->cld_start = cl_index(child->cld_obj, sub_start); + subd->cld_end = cl_index(child->cld_obj, sub_end); + result = cl_lock_ext_match(child, subd); + } + return result; +} + +/** + * An implementation of cl_lock_operations::clo_fits_into() method. + * + * Checks whether a lock (given by \a slice) is suitable for \a + * io. Multi-stripe locks can be used only for "quick" io, like truncate, or + * O_APPEND write. + * + * \see ccc_lock_fits_into(). + */ +static int lov_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct lov_lock *lov = cl2lov_lock(slice); + struct lov_object *obj = cl2lov(slice->cls_obj); + int result; + + LASSERT(cl_object_same(need->cld_obj, slice->cls_obj)); + LASSERT(lov->lls_nr > 0); + + ENTRY; + + /* for top lock, it's necessary to match enq flags otherwise it will + * run into problem if a sublock is missing and reenqueue. */ + if (need->cld_enq_flags != lov->lls_orig.cld_enq_flags) + return 0; + + if (need->cld_mode == CLM_GROUP) + /* + * always allow to match group lock. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + else if (lov->lls_nr == 1) { + struct cl_lock_descr *got = &lov->lls_sub[0].sub_got; + result = lov_lock_stripe_is_matching(env, + cl2lov(slice->cls_obj), + lov->lls_sub[0].sub_stripe, + got, need); + } else if (io->ci_type != CIT_SETATTR && io->ci_type != CIT_MISC && + !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM) + /* + * Multi-stripe locks are only suitable for `quick' IO and for + * glimpse. + */ + result = 0; + else + /* + * Most general case: multi-stripe existing lock, and + * (potentially) multi-stripe @need lock. Check that @need is + * covered by @lov's sub-locks. + * + * For now, ignore lock expansions made by the server, and + * match against original lock extent. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %d %d/%d: %d\n", + PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got), + lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr, + result); + RETURN(result); +} + +void lov_lock_unlink(const struct lu_env *env, + struct lov_lock_link *link, struct lovsub_lock *sub) +{ + struct lov_lock *lck = link->lll_super; + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + ENTRY; + + list_del_init(&link->lll_list); + LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub); + /* yank this sub-lock from parent's array */ + lck->lls_sub[link->lll_idx].sub_lock = NULL; + LASSERT(lck->lls_nr_filled > 0); + lck->lls_nr_filled--; + lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock); + cl_lock_put(env, parent); + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + EXIT; +} + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub) +{ + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + ENTRY; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + if (scan->lll_super == lck) + RETURN(scan); + } + RETURN(NULL); +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked for "top-to-bottom" delete, when lock destruction starts from the + * top-lock, e.g., as a result of inode destruction. + * + * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there: + * this is done separately elsewhere: + * + * - for inode destruction, lov_object_delete() calls cl_object_kill() for + * each sub-object, purging its locks; + * + * - in other cases (e.g., a fatal error with a top-lock) sub-locks are + * left in the cache. + */ +static void lov_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + struct lov_lock_link *link; + int rc; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_FREEING); + ENTRY; + + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *lls = &lck->lls_sub[i]; + struct lovsub_lock *lsl = lls->sub_lock; + + if (lsl == NULL) /* already removed */ + continue; + + rc = lov_sublock_lock(env, lck, lls, closure, NULL); + if (rc == CLO_REPEAT) { + --i; + continue; + } + + LASSERT(rc == 0); + LASSERT(lsl->lss_cl.cls_lock->cll_state < CLS_FREEING); + + if (lls->sub_flags & LSF_HELD) + lov_sublock_release(env, lck, i, 1, 0); + + link = lov_lock_link_find(env, lck, lsl); + LASSERT(link != NULL); + lov_lock_unlink(env, link, lsl); + LASSERT(lck->lls_sub[i].sub_lock == NULL); + + lov_sublock_unlock(env, lsl, closure, NULL); + } + + cl_lock_closure_fini(closure); + EXIT; +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_flags); + if (sub->sub_lock != NULL) + cl_lock_print(env, cookie, p, + sub->sub_lock->lss_cl.cls_lock); + else + (*p)(env, cookie, "---\n"); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_wait = lov_lock_wait, + .clo_use = lov_lock_use, + .clo_unuse = lov_lock_unuse, + .clo_cancel = lov_lock_cancel, + .clo_fits_into = lov_lock_fits_into, + .clo_delete = lov_lock_delete, + .clo_print = lov_lock_print +}; + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + result = lov_lock_sub_init(env, lck, io); + } else + result = -ENOMEM; + RETURN(result); +} + +static void lov_empty_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); +} + +static int lov_empty_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + (*p)(env, cookie, "empty\n"); + return 0; +} + +/* XXX: more methods will be added later. */ +static const struct cl_lock_operations lov_empty_lock_ops = { + .clo_fini = lov_empty_lock_fini, + .clo_print = lov_empty_lock_print +}; + +int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result = -ENOMEM; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lck, lov_lock_kmem, __GFP_IO); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); + lck->lls_orig = lock->cll_descr; + result = 0; + } + RETURN(result); +} + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent) +{ + struct cl_lock_closure *closure; + + closure = &lov_env_info(env)->lti_closure; + LASSERT(list_empty(&closure->clc_list)); + cl_lock_closure_init(env, closure, parent, 1); + return closure; +} + + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_log.c b/drivers/staging/lustre/lustre/lov/lov_log.c new file mode 100644 index 000000000000..63b7f8d3182f --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_log.c @@ -0,0 +1,278 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_log.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Add log records for each OSC that this object is striped over, and return + * cookies for each one. We _would_ have nice abstraction here, except that + * we need to keep cookies in stripe order, even if some are NULL, so that + * the right cookies are passed back to the right OSTs at the client side. + * Unset cookies should be all-zero (which will never occur naturally). */ +static int lov_llog_origin_add(const struct lu_env *env, + struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, + struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies) +{ + struct obd_device *obd = ctxt->loc_obd; + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0, cookies = 0; + ENTRY; + + LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count, + "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n", + logcookies, numcookies, lsm->lsm_stripe_count); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + struct obd_device *child = + lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd; + struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx); + + /* fill mds unlink/setattr log record */ + switch (rec->lrh_type) { + case MDS_UNLINK_REC: { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + lur->lur_oid = ostid_id(&loi->loi_oi); + lur->lur_oseq = (__u32)ostid_seq(&loi->loi_oi); + break; + } + case MDS_SETATTR64_REC: { + struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec; + lsr->lsr_oi = loi->loi_oi; + break; + } + default: + break; + } + + /* inject error in llog_obd_add() below */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) { + llog_ctxt_put(cctxt); + cctxt = NULL; + } + rc = llog_obd_add(env, cctxt, rec, NULL, logcookies + cookies, + numcookies - cookies); + llog_ctxt_put(cctxt); + if (rc < 0) { + CERROR("Can't add llog (rc = %d) for stripe %d\n", + rc, cookies); + memset(logcookies + cookies, 0, + sizeof(struct llog_cookie)); + rc = 1; /* skip this cookie */ + } + /* Note that rc is always 1 if llog_obd_add was successful */ + cookies += rc; + } + RETURN(cookies); +} + +static int lov_llog_origin_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, + struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct obd_device *obd = ctxt->loc_obd; + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0, err = 0; + ENTRY; + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct obd_device *child; + struct llog_ctxt *cctxt; + + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) + continue; + if (uuid && !obd_uuid_equals(uuid, &lov->lov_tgts[i]->ltd_uuid)) + continue; + CDEBUG(D_CONFIG, "connect %d/%d\n", i, lov->desc.ld_tgt_count); + child = lov->lov_tgts[i]->ltd_exp->exp_obd; + cctxt = llog_get_context(child, ctxt->loc_idx); + rc = llog_connect(cctxt, logid, gen, uuid); + llog_ctxt_put(cctxt); + + if (rc) { + CERROR("error osc_llog_connect tgt %d (%d)\n", i, rc); + if (!err) + err = rc; + } + } + obd_putref(obd); + + RETURN(err); +} + +/* the replicators commit callback */ +static int lov_llog_repl_cancel(const struct lu_env *env, + struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, + int count, struct llog_cookie *cookies, + int flags) +{ + struct lov_obd *lov; + struct obd_device *obd = ctxt->loc_obd; + int rc = 0, i; + ENTRY; + + LASSERT(lsm != NULL); + LASSERT(count == lsm->lsm_stripe_count); + + lov = &obd->u.lov; + obd_getref(obd); + for (i = 0; i < count; i++, cookies++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + struct obd_device *child = + lov->lov_tgts[loi->loi_ost_idx]->ltd_exp->exp_obd; + struct llog_ctxt *cctxt = + llog_get_context(child, ctxt->loc_idx); + int err; + + err = llog_cancel(env, cctxt, NULL, 1, cookies, flags); + llog_ctxt_put(cctxt); + if (err && lov->lov_tgts[loi->loi_ost_idx]->ltd_active) { + CERROR("%s: objid "DOSTID" subobj "DOSTID + " on OST idx %d: rc = %d\n", + obd->obd_name, POSTID(&lsm->lsm_oi), + POSTID(&loi->loi_oi), loi->loi_ost_idx, err); + if (!rc) + rc = err; + } + } + obd_putref(obd); + RETURN(rc); +} + +static struct llog_operations lov_mds_ost_orig_logops = { + .lop_obd_add = lov_llog_origin_add, + .lop_connect = lov_llog_origin_connect, +}; + +static struct llog_operations lov_size_repl_logops = { + .lop_cancel = lov_llog_repl_cancel, +}; + +int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *index) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_device *child; + int i, rc = 0; + ENTRY; + + LASSERT(olg == &obd->obd_olg); + rc = llog_setup(NULL, obd, olg, LLOG_MDS_OST_ORIG_CTXT, disk_obd, + &lov_mds_ost_orig_logops); + if (rc) + RETURN(rc); + + rc = llog_setup(NULL, obd, olg, LLOG_SIZE_REPL_CTXT, disk_obd, + &lov_size_repl_logops); + if (rc) + GOTO(err_cleanup, rc); + + obd_getref(obd); + /* count may not match lov->desc.ld_tgt_count during dynamic ost add */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + if (index && i != *index) + continue; + + child = lov->lov_tgts[i]->ltd_obd; + rc = obd_llog_init(child, &child->obd_olg, disk_obd, &i); + if (rc) + CERROR("error osc_llog_init idx %d osc '%s' tgt '%s' " + "(rc=%d)\n", i, child->obd_name, + disk_obd->obd_name, rc); + rc = 0; + } + obd_putref(obd); + GOTO(err_cleanup, rc); +err_cleanup: + if (rc) { + struct llog_ctxt *ctxt = + llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + } + return rc; +} + +int lov_llog_finish(struct obd_device *obd, int count) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + /* cleanup our llogs only if the ctxts have been setup + * (client lov doesn't setup, mds lov does). */ + ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + + ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + + /* lov->tgt llogs are cleaned during osc_cleanup. */ + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c new file mode 100644 index 000000000000..ddbac1220263 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_merge.c @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include + +#include "lov_internal.h" + +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. + */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place) +{ + __u64 size = 0; + __u64 kms = 0; + __u64 blocks = 0; + obd_time current_mtime = lvb->lvb_mtime; + obd_time current_atime = lvb->lvb_atime; + obd_time current_ctime = lvb->lvb_ctime; + int i; + int rc = 0; + + LASSERT(spin_is_locked(&lsm->lsm_lock)); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + CDEBUG(D_INODE, "MDT ID "DOSTID" initial value: s="LPU64" m="LPU64 + " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, + lvb->lvb_blocks); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + obd_size lov_size, tmpsize; + + if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { + rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); + continue; + } + + tmpsize = loi->loi_kms; + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) + tmpsize = loi->loi_lvb.lvb_size; + + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > size) + size = lov_size; + /* merge blocks, mtime, atime */ + blocks += loi->loi_lvb.lvb_blocks; + if (loi->loi_lvb.lvb_mtime > current_mtime) + current_mtime = loi->loi_lvb.lvb_mtime; + if (loi->loi_lvb.lvb_atime > current_atime) + current_atime = loi->loi_lvb.lvb_atime; + if (loi->loi_lvb.lvb_ctime > current_ctime) + current_ctime = loi->loi_lvb.lvb_ctime; + + CDEBUG(D_INODE, "MDT ID "DOSTID" on OST[%u]: s="LPU64" m="LPU64 + " a="LPU64" c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), + loi->loi_ost_idx, loi->loi_lvb.lvb_size, + loi->loi_lvb.lvb_mtime, loi->loi_lvb.lvb_atime, + loi->loi_lvb.lvb_ctime, loi->loi_lvb.lvb_blocks); + } + + *kms_place = kms; + lvb->lvb_size = size; + lvb->lvb_blocks = blocks; + lvb->lvb_mtime = current_mtime; + lvb->lvb_atime = current_atime; + lvb->lvb_ctime = current_ctime; + RETURN(rc); +} + +/** Merge the lock value block(&lvb) attributes from each of the stripes in a + * file into a single lvb. It is expected that the caller initializes the + * current atime, mtime, ctime to avoid regressing a more uptodate time on + * the local client. + * + * If \a kms_only is set then we do not consider the recently seen size (rss) + * when updating the known minimum size (kms). Even when merging RSS, we will + * take the KMS value if it's larger. This prevents getattr from stomping on + * dirty cached pages which extend the file size. */ +int lov_merge_lvb(struct obd_export *exp, + struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only) +{ + int rc; + __u64 kms; + + ENTRY; + lov_stripe_lock(lsm); + rc = lov_merge_lvb_kms(lsm, lvb, &kms); + lov_stripe_unlock(lsm); + if (kms_only) + lvb->lvb_size = kms; + + CDEBUG(D_INODE, "merged for ID "DOSTID" s="LPU64" m="LPU64" a="LPU64 + " c="LPU64" b="LPU64"\n", POSTID(&lsm->lsm_oi), lvb->lvb_size, + lvb->lvb_mtime, lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(rc); +} + +/* Must be called under the lov_stripe_lock() */ +int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, + obd_off size, int shrink) +{ + struct lov_oinfo *loi; + int stripe = 0; + __u64 kms; + ENTRY; + + LASSERT(spin_is_locked(&lsm->lsm_lock)); + LASSERT(lsm->lsm_lock_owner == current_pid()); + + if (shrink) { + for (; stripe < lsm->lsm_stripe_count; stripe++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[stripe]; + kms = lov_size_to_stripe(lsm, size, stripe); + CDEBUG(D_INODE, + "stripe %d KMS %sing "LPU64"->"LPU64"\n", + stripe, kms > loi->loi_kms ? "increas":"shrink", + loi->loi_kms, kms); + loi_kms_set(loi, loi->loi_lvb.lvb_size = kms); + } + RETURN(0); + } + + if (size > 0) + stripe = lov_stripe_number(lsm, size - 1); + kms = lov_size_to_stripe(lsm, size, stripe); + loi = lsm->lsm_oinfo[stripe]; + + CDEBUG(D_INODE, "stripe %d KMS %sincreasing "LPU64"->"LPU64"\n", + stripe, kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms); + if (kms > loi->loi_kms) + loi_kms_set(loi, kms); + + RETURN(0); +} + +void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_valid valid, + struct lov_stripe_md *lsm, int stripeno, int *set) +{ + valid &= src->o_valid; + + if (*set) { + if (valid & OBD_MD_FLSIZE) { + /* this handles sparse files properly */ + obd_size lov_size; + + lov_size = lov_stripe_size(lsm, src->o_size, stripeno); + if (lov_size > tgt->o_size) + tgt->o_size = lov_size; + } + if (valid & OBD_MD_FLBLOCKS) + tgt->o_blocks += src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + tgt->o_blksize += src->o_blksize; + if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime) + tgt->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime) + tgt->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLDATAVERSION) + tgt->o_data_version += src->o_data_version; + } else { + memcpy(tgt, src, sizeof(*tgt)); + tgt->o_oi = lsm->lsm_oi; + if (valid & OBD_MD_FLSIZE) + tgt->o_size = lov_stripe_size(lsm, src->o_size, + stripeno); + } + + /* data_version needs to be valid on all stripes to be correct! */ + if (!(valid & OBD_MD_FLDATAVERSION)) + tgt->o_valid &= ~OBD_MD_FLDATAVERSION; + + *set += 1; +} diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c new file mode 100644 index 000000000000..8089f03a200e --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_obd.c @@ -0,0 +1,2923 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_obd.c + * + * Author: Phil Schwan + * Author: Peter Braam + * Author: Mike Shaver + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. + Any function that expects lov_tgts to remain stationary must take a ref. */ +static void lov_getref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + /* nobody gets through here until lov_putref is done */ + mutex_lock(&lov->lov_lock); + atomic_inc(&lov->lov_refcount); + mutex_unlock(&lov->lov_lock); + return; +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); + +static void lov_putref(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + + mutex_lock(&lov->lov_lock); + /* ok to dec to 0 more than once -- ltd_exp's will be null */ + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + LIST_HEAD(kill); + int i; + struct lov_tgt_desc *tgt, *n; + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) + continue; + list_add(&tgt->ltd_kill, &kill); + /* XXX - right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; + lov->lov_death_row--; + } + mutex_unlock(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_unlock(&lov->lov_lock); + } +} + +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + +#define MAX_STRING_SIZE 128 +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) +{ + struct lov_obd *lov = &obd->u.lov; + struct obd_uuid *tgt_uuid; + struct obd_device *tgt_obd; + static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + struct obd_import *imp; + proc_dir_entry_t *lov_proc_dir; + int rc; + ENTRY; + + if (!lov->lov_tgts[index]) + RETURN(-EINVAL); + + tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; + tgt_obd = lov->lov_tgts[index]->ltd_obd; + + if (!tgt_obd->obd_set_up) { + CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); + RETURN(-EINVAL); + } + + /* override the sp_me from lov */ + tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; + + if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) + data->ocd_index = index; + + /* + * Divine LOV knows that OBDs under it are OSCs. + */ + imp = tgt_obd->u.cli.cl_import; + + if (activate) { + tgt_obd->obd_no_recov = 0; + /* FIXME this is probably supposed to be + ptlrpc_set_import_active. Horrible naming. */ + ptlrpc_activate_import(imp); + } + + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(rc); + } + + + if (imp->imp_invalid) { + CDEBUG(D_CONFIG, "not connecting OSC %s; administratively " + "disabled\n", obd_uuid2str(tgt_uuid)); + RETURN(0); + } + + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { + CERROR("Target %s connect error %d\n", + obd_uuid2str(tgt_uuid), rc); + RETURN(-ENODEV); + } + + lov->lov_tgts[index]->ltd_reap = 0; + + CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, + obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); + + lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); + if (lov_proc_dir) { + struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; + proc_dir_entry_t *osc_symlink; + + LASSERT(osc_obd != NULL); + LASSERT(osc_obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(osc_obd->obd_type->typ_name != NULL); + + osc_symlink = lprocfs_add_symlink(osc_obd->obd_name, + lov_proc_dir, + "../../../%s/%s", + osc_obd->obd_type->typ_name, + osc_obd->obd_name); + if (osc_symlink == NULL) { + CERROR("could not register LOV target " + "/proc/fs/lustre/%s/%s/target_obds/%s.", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); + lprocfs_remove(&lov_proc_dir); + } + } + + RETURN(0); +} + +static int lov_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct lustre_handle conn; + int i, rc; + ENTRY; + + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); + + rc = class_connect(&conn, obd, cluuid); + if (rc) + RETURN(rc); + + *exp = class_conn2export(&conn); + + /* Why should there ever be more than 1 connect? */ + lov->lov_connects++; + LASSERT(lov->lov_connects == 1); + + memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); + if (data) + lov->lov_ocd = *data; + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) + continue; + /* Flags will be lowest common denominator */ + rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); + if (rc) { + CERROR("%s: lov connect tgt %d failed: %d\n", + obd->obd_name, i, rc); + continue; + } + /* connect to administrative disabled ost */ + if (!lov->lov_tgts[i]->ltd_exp) + continue; + + rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, + OBD_NOTIFY_CONNECT, (void *)&i); + if (rc) { + CERROR("%s error sending notify %d\n", + obd->obd_name, rc); + } + } + obd_putref(obd); + + RETURN(0); +} + +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + proc_dir_entry_t *lov_proc_dir; + struct lov_obd *lov = &obd->u.lov; + struct obd_device *osc_obd; + int rc; + ENTRY; + + osc_obd = class_exp2obd(tgt->ltd_exp); + CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", + obd->obd_name, osc_obd->obd_name); + + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + tgt->ltd_exp->exp_obd->obd_inactive = 1; + } + + lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); + if (lov_proc_dir) { + proc_dir_entry_t *osc_symlink; + + osc_symlink = lprocfs_srch(lov_proc_dir, osc_obd->obd_name); + if (osc_symlink) { + lprocfs_remove(&osc_symlink); + } else { + CERROR("/proc/fs/lustre/%s/%s/target_obds/%s missing.", + obd->obd_type->typ_name, obd->obd_name, + osc_obd->obd_name); + } + } + + if (osc_obd) { + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. + */ + osc_obd->obd_force = obd->obd_force; + osc_obd->obd_fail = obd->obd_fail; + osc_obd->obd_no_recov = obd->obd_no_recov; + } + + obd_register_observer(osc_obd, NULL); + + rc = obd_disconnect(tgt->ltd_exp); + if (rc) { + CERROR("Target %s disconnect error %d\n", + tgt->ltd_uuid.uuid, rc); + rc = 0; + } + + tgt->ltd_exp = NULL; + RETURN(0); +} + +static int lov_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc; + ENTRY; + + if (!lov->lov_tgts) + goto out; + + /* Only disconnect the underlying layers on the final disconnect. */ + lov->lov_connects--; + if (lov->lov_connects != 0) { + /* why should there be more than 1 connect? */ + CERROR("disconnect #%d\n", lov->lov_connects); + goto out; + } + + /* Let's hold another reference so lov_del_obd doesn't spin through + putref every time */ + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { + /* Disconnection is the last we know about an obd */ + lov_del_target(obd, i, 0, lov->lov_tgts[i]->ltd_gen); + } + } + obd_putref(obd); + +out: + rc = class_disconnect(exp); /* bz 9811 */ + RETURN(rc); +} + +/* Error codes: + * + * -EINVAL : UUID can't be found in the LOV's target list + * -ENOTCONN: The UUID is found, but the target connection is bad (!) + * -EBADF : The UUID is found, but the OBD is the wrong type (!) + * any >= 0 : is log target index + */ +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + enum obd_notify_event ev) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + int index, activate, active; + ENTRY; + + CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", + lov, uuid->uuid, ev); + + obd_getref(obd); + for (index = 0; index < lov->desc.ld_tgt_count; index++) { + tgt = lov->lov_tgts[index]; + if (!tgt) + continue; + /* + * LU-642, initially inactive OSC could miss the obd_connect, + * we make up for it here. + */ + if (ev == OBD_NOTIFY_ACTIVATE && tgt->ltd_exp == NULL && + obd_uuid_equals(uuid, &tgt->ltd_uuid)) { + struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; + + obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, + &lov_osc_uuid, &lov->lov_ocd, NULL); + } + if (!tgt->ltd_exp) + continue; + + CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n", + index, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_exp->exp_handle.h_cookie); + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) + break; + } + + if (index == lov->desc.ld_tgt_count) + GOTO(out, index = -EINVAL); + + if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { + activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_activate == activate) { + CDEBUG(D_INFO, "OSC %s already %sactivate!\n", + uuid->uuid, activate ? "" : "de"); + } else { + lov->lov_tgts[index]->ltd_activate = activate; + CDEBUG(D_CONFIG, "%sactivate OSC %s\n", + activate ? "" : "de", obd_uuid2str(uuid)); + } + + } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { + active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; + + if (lov->lov_tgts[index]->ltd_active == active) { + CDEBUG(D_INFO, "OSC %s already %sactive!\n", + uuid->uuid, active ? "" : "in"); + GOTO(out, index); + } else { + CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", + obd_uuid2str(uuid), active ? "" : "in"); + } + + lov->lov_tgts[index]->ltd_active = active; + if (active) { + lov->desc.ld_active_tgt_count++; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; + } else { + lov->desc.ld_active_tgt_count--; + lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + } + } else { + CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid); + } + + out: + obd_putref(obd); + RETURN(index); +} + +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; + + down_read(&lov->lov_notify_lock); + if (!lov->lov_connects) { + up_read(&lov->lov_notify_lock); + RETURN(rc); + } + + if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || + ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { + struct obd_uuid *uuid; + + LASSERT(watched); + + if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { + up_read(&lov->lov_notify_lock); + CERROR("unexpected notification of %s %s!\n", + watched->obd_type->typ_name, + watched->obd_name); + RETURN(-EINVAL); + } + uuid = &watched->u.cli.cl_target_uuid; + + /* Set OSC as active before notifying the observer, so the + * observer can use the OSC normally. + */ + rc = lov_set_osc_active(obd, uuid, ev); + if (rc < 0) { + up_read(&lov->lov_notify_lock); + CERROR("event(%d) of %s failed: %d\n", ev, + obd_uuid2str(uuid), rc); + RETURN(rc); + } + /* active event should be pass lov target index as data */ + data = &rc; + } + + /* Pass the notification up the chain. */ + if (watched) { + rc = obd_notify_observer(obd, watched, ev, data); + } else { + /* NULL watched means all osc's in the lov (only for syncs) */ + /* sync event should be send lov idx as data */ + struct lov_obd *lov = &obd->u.lov; + int i, is_sync; + + data = &i; + is_sync = (ev == OBD_NOTIFY_SYNC) || + (ev == OBD_NOTIFY_SYNC_NONBLOCK); + + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* don't send sync event if target not + * connected/activated */ + if (is_sync && !lov->lov_tgts[i]->ltd_active) + continue; + + rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd, + ev, data); + if (rc) { + CERROR("%s: notify %s of %s failed %d\n", + obd->obd_name, + obd->obd_observer->obd_name, + lov->lov_tgts[i]->ltd_obd->obd_name, + rc); + } + } + obd_putref(obd); + } + + up_read(&lov->lov_notify_lock); + RETURN(rc); +} + +static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, + __u32 index, int gen, int active) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + struct obd_device *tgt_obd; + int rc; + ENTRY; + + CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", + uuidp->uuid, index, gen, active); + + if (gen <= 0) { + CERROR("request to add OBD %s with invalid generation: %d\n", + uuidp->uuid, gen); + RETURN(-EINVAL); + } + + tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, + &obd->obd_uuid); + if (tgt_obd == NULL) + RETURN(-EINVAL); + + mutex_lock(&lov->lov_lock); + + if ((index < lov->lov_tgt_size) && (lov->lov_tgts[index] != NULL)) { + tgt = lov->lov_tgts[index]; + CERROR("UUID %s already assigned at LOV target index %d\n", + obd_uuid2str(&tgt->ltd_uuid), index); + mutex_unlock(&lov->lov_lock); + RETURN(-EEXIST); + } + + if (index >= lov->lov_tgt_size) { + /* We need to reallocate the lov target array. */ + struct lov_tgt_desc **newtgts, **old = NULL; + __u32 newsize, oldsize = 0; + + newsize = max(lov->lov_tgt_size, (__u32)2); + while (newsize < index + 1) + newsize = newsize << 1; + OBD_ALLOC(newtgts, sizeof(*newtgts) * newsize); + if (newtgts == NULL) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + if (lov->lov_tgt_size) { + memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * + lov->lov_tgt_size); + old = lov->lov_tgts; + oldsize = lov->lov_tgt_size; + } + + lov->lov_tgts = newtgts; + lov->lov_tgt_size = newsize; + smp_rmb(); + if (old) + OBD_FREE(old, sizeof(*old) * oldsize); + + CDEBUG(D_CONFIG, "tgts: %p size: %d\n", + lov->lov_tgts, lov->lov_tgt_size); + } + + OBD_ALLOC_PTR(tgt); + if (!tgt) { + mutex_unlock(&lov->lov_lock); + RETURN(-ENOMEM); + } + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + mutex_unlock(&lov->lov_lock); + OBD_FREE_PTR(tgt); + RETURN(rc); + } + + tgt->ltd_uuid = *uuidp; + tgt->ltd_obd = tgt_obd; + /* XXX - add a sanity check on the generation number. */ + tgt->ltd_gen = gen; + tgt->ltd_index = index; + tgt->ltd_activate = active; + lov->lov_tgts[index] = tgt; + if (index >= lov->desc.ld_tgt_count) + lov->desc.ld_tgt_count = index + 1; + + mutex_unlock(&lov->lov_lock); + + CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", + index, tgt->ltd_gen, lov->desc.ld_tgt_count); + + rc = obd_notify(obd, tgt_obd, OBD_NOTIFY_CREATE, &index); + + if (lov->lov_connects == 0) { + /* lov_connect hasn't been called yet. We'll do the + lov_connect_obd on this target when that fn first runs, + because we don't know the connect flags yet. */ + RETURN(0); + } + + obd_getref(obd); + + rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); + if (rc) + GOTO(out, rc); + + /* connect to administrative disabled ost */ + if (!tgt->ltd_exp) + GOTO(out, rc = 0); + + if (lov->lov_cache != NULL) { + rc = obd_set_info_async(NULL, tgt->ltd_exp, + sizeof(KEY_CACHE_SET), KEY_CACHE_SET, + sizeof(struct cl_client_cache), lov->lov_cache, + NULL); + if (rc < 0) + GOTO(out, rc); + } + + rc = lov_notify(obd, tgt->ltd_exp->exp_obd, + active ? OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE, + (void *)&index); + +out: + if (rc) { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_del_target(obd, index, 0, 0); + } + obd_putref(obd); + RETURN(rc); +} + +/* Schedule a target for deletion */ +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen) +{ + struct lov_obd *lov = &obd->u.lov; + int count = lov->desc.ld_tgt_count; + int rc = 0; + ENTRY; + + if (index >= count) { + CERROR("LOV target index %d >= number of LOV OBDs %d.\n", + index, count); + RETURN(-EINVAL); + } + + /* to make sure there's no ongoing lov_notify() now */ + down_write(&lov->lov_notify_lock); + obd_getref(obd); + + if (!lov->lov_tgts[index]) { + CERROR("LOV target at index %d is not setup.\n", index); + GOTO(out, rc = -EINVAL); + } + + if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { + CERROR("LOV target UUID %s at index %d doesn't match %s.\n", + lov_uuid2str(lov, index), index, + obd_uuid2str(uuidp)); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", + lov_uuid2str(lov, index), index, + lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, + lov->lov_tgts[index]->ltd_active); + + lov->lov_tgts[index]->ltd_reap = 1; + lov->lov_death_row++; + /* we really delete it from obd_putref */ +out: + obd_putref(obd); + up_write(&lov->lov_notify_lock); + + RETURN(rc); +} + +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct obd_device *osc_obd; + + LASSERT(tgt); + LASSERT(tgt->ltd_reap); + + osc_obd = class_exp2obd(tgt->ltd_exp); + + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", + tgt->ltd_uuid.uuid, + osc_obd ? osc_obd->obd_name : ""); + + if (tgt->ltd_exp) + lov_disconnect_obd(obd, tgt); + + OBD_FREE_PTR(tgt); + + /* Manual cleanup - no cleanup logs to clean up the osc's. We must + do it ourselves. And we can't do it from lov_cleanup, + because we just lost our only reference to it. */ + if (osc_obd) + class_manual_cleanup(osc_obd); +} + +void lov_fix_desc_stripe_size(__u64 *val) +{ + if (*val < LOV_DEFAULT_STRIPE_SIZE) { + LCONSOLE_WARN("Increasing default stripe size to min %u\n", + LOV_DEFAULT_STRIPE_SIZE); + *val = LOV_DEFAULT_STRIPE_SIZE; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); + LCONSOLE_WARN("Changing default stripe size to "LPU64" (a " + "multiple of %u)\n", + *val, LOV_MIN_STRIPE_SIZE); + } +} + +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} + +void lov_fix_desc_pattern(__u32 *val) +{ + /* from lov_setstripe */ + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; + } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ + /* fix qos_maxage */ + if (*val == 0) + *val = QOS_DEFAULT_MAXAGE; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { 0 }; + struct lov_desc *desc; + struct lov_obd *lov = &obd->u.lov; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("LOV setup requires a descriptor\n"); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); + + if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("descriptor size wrong: %d > %d\n", + (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); + RETURN(-EINVAL); + } + + if (desc->ld_magic != LOV_DESC_MAGIC) { + if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { + CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", + obd->obd_name, desc); + lustre_swab_lov_desc(desc); + } else { + CERROR("%s: Bad lov desc magic: %#x\n", + obd->obd_name, desc->ld_magic); + RETURN(-EINVAL); + } + } + + lov_fix_desc(desc); + + desc->ld_active_tgt_count = 0; + lov->desc = *desc; + lov->lov_tgt_size = 0; + + mutex_init(&lov->lov_lock); + atomic_set(&lov->lov_refcount, 0); + lov->lov_sp_me = LUSTRE_SP_CLI; + + init_rwsem(&lov->lov_notify_lock); + + lov->lov_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, + HASH_POOLS_MAX_BITS, + HASH_POOLS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &pool_hash_operations, + CFS_HASH_DEFAULT); + INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + GOTO(out, rc); + + lprocfs_lov_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); +#ifdef LPROCFS + { + int rc; + + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", + 0444, &lov_proc_target_fops, obd); + if (rc) + CWARN("Error adding the target_obd file\n"); + } +#endif + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); + + RETURN(0); + +out: + return rc; +} + +static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + struct lov_obd *lov = &obd->u.lov; + + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: { + int i; + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) + continue; + obd_precleanup(class_exp2obd(lov->lov_tgts[i]->ltd_exp), + OBD_CLEANUP_EARLY); + } + break; + } + case OBD_CLEANUP_EXPORTS: + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + RETURN(rc); +} + +static int lov_cleanup(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + ENTRY; + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + lov_pool_del(obd, pool->pool_name); + } + cfs_hash_putref(lov->lov_pools_hash_body); + lov_ost_pool_free(&lov->lov_packed); + + lprocfs_obd_cleanup(obd); + if (lov->lov_tgts) { + int i; + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Inactive targets may never have connected */ + if (lov->lov_tgts[i]->ltd_active || + atomic_read(&lov->lov_refcount)) + /* We should never get here - these + should have been removed in the + disconnect. */ + CERROR("lov tgt %d not cleaned!" + " deathrow=%d, lovrc=%d\n", + i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, i, 0, 0); + } + obd_putref(obd); + OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * + lov->lov_tgt_size); + lov->lov_tgt_size = 0; + } + RETURN(0); +} + +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp) +{ + struct obd_uuid obd_uuid; + int cmd; + int rc = 0; + ENTRY; + + switch(cmd = lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + case LCFG_LOV_DEL_OBD: { + __u32 index; + int gen; + /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ + if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) + GOTO(out, rc = -EINVAL); + + obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); + + if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1) + GOTO(out, rc = -EINVAL); + if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1) + GOTO(out, rc = -EINVAL); + index = *indexp; + gen = *genp; + if (cmd == LCFG_LOV_ADD_OBD) + rc = lov_add_target(obd, &obd_uuid, index, gen, 1); + else if (cmd == LCFG_LOV_ADD_INA) + rc = lov_add_target(obd, &obd_uuid, index, gen, 0); + else + rc = lov_del_target(obd, index, &obd_uuid, gen); + GOTO(out, rc); + } + case LCFG_PARAM: { + struct lprocfs_static_vars lvars = { 0 }; + struct lov_desc *desc = &(obd->u.lov.desc); + + if (!desc) + GOTO(out, rc = -EINVAL); + + lprocfs_lov_init_vars(&lvars); + + rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + GOTO(out, rc); + } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +static int lov_recreate(struct obd_export *exp, struct obdo *src_oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct lov_stripe_md *obj_mdp, *lsm; + struct lov_obd *lov = &exp->exp_obd->u.lov; + unsigned ost_idx; + int rc, i; + ENTRY; + + LASSERT(src_oa->o_valid & OBD_MD_FLFLAGS && + src_oa->o_flags & OBD_FL_RECREATE_OBJS); + + OBD_ALLOC(obj_mdp, sizeof(*obj_mdp)); + if (obj_mdp == NULL) + RETURN(-ENOMEM); + + ost_idx = src_oa->o_nlink; + lsm = *ea; + if (lsm == NULL) + GOTO(out, rc = -EINVAL); + if (ost_idx >= lov->desc.ld_tgt_count || + !lov->lov_tgts[ost_idx]) + GOTO(out, rc = -EINVAL); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (lsm->lsm_oinfo[i]->loi_ost_idx == ost_idx) { + if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) != + ostid_id(&src_oa->o_oi)) + GOTO(out, rc = -EINVAL); + break; + } + } + if (i == lsm->lsm_stripe_count) + GOTO(out, rc = -EINVAL); + + rc = obd_create(NULL, lov->lov_tgts[ost_idx]->ltd_exp, + src_oa, &obj_mdp, oti); +out: + OBD_FREE(obj_mdp, sizeof(*obj_mdp)); + RETURN(rc); +} + +/* the LOV expects oa->o_id to be set to the LOV object id */ +static int lov_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *src_oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + struct lov_obd *lov; + int rc = 0; + ENTRY; + + LASSERT(ea != NULL); + if (exp == NULL) + RETURN(-EINVAL); + + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + src_oa->o_flags == OBD_FL_DELORPHAN) { + /* should be used with LOV anymore */ + LBUG(); + } + + lov = &exp->exp_obd->u.lov; + if (!lov->desc.ld_active_tgt_count) + RETURN(-EIO); + + obd_getref(exp->exp_obd); + /* Recreate a specific object id at the given OST index */ + if ((src_oa->o_valid & OBD_MD_FLFLAGS) && + (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) { + rc = lov_recreate(exp, src_oa, ea, oti); + } + + obd_putref(exp->exp_obd); + RETURN(rc); +} + +#define ASSERT_LSM_MAGIC(lsmp) \ +do { \ + LASSERT((lsmp) != NULL); \ + LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 || \ + (lsmp)->lsm_magic == LOV_MAGIC_V3), \ + "%p->lsm_magic=%x\n", (lsmp), (lsmp)->lsm_magic); \ +} while (0) + +static int lov_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa) +{ + struct lov_request_set *set; + struct obd_info oinfo; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0, err = 0; + ENTRY; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + lov = &exp->exp_obd->u.lov; + obd_getref(exp->exp_obd); + rc = lov_prep_destroy_set(exp, &oinfo, oa, lsm, oti, &set); + if (rc) + GOTO(out, rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + err = obd_destroy(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + req->rq_oi.oi_oa, NULL, oti, NULL, capa); + err = lov_update_common_set(set, req, err); + if (err) { + CERROR("%s: destroying objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, POSTID(&oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, err); + if (!rc) + rc = err; + } + } + + if (rc == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp); + } + err = lov_fini_destroy_set(set); +out: + obd_putref(exp->exp_obd); + RETURN(rc ? rc : err); +} + +static int lov_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int err = 0, rc = 0; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + + rc = lov_prep_getattr_set(exp, oinfo, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx" + " %u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + + rc = obd_getattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi); + err = lov_update_common_set(set, req, rc); + if (err) { + CERROR("%s: getattr objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&oinfo->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, err); + break; + } + } + + rc = lov_fini_getattr_set(set); + if (err) + rc = err; + RETURN(rc); +} + +static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + /* don't do attribute merge if this aysnc op failed */ + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + RETURN(rc ? rc : err); +} + +static int lov_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *lovset; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int rc = 0, err; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + + rc = lov_prep_getattr_set(exp, oinfo, &lovset); + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &lovset->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx" + "%u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + rc = obd_getattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, rqset); + if (rc) { + CERROR("%s: getattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&oinfo->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + GOTO(out, rc); + } + } + + if (!list_empty(&rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT (rqset->set_interpret == NULL); + rqset->set_interpret = lov_getattr_interpret; + rqset->set_arg = (void *)lovset; + RETURN(rc); + } +out: + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_getattr_set(lovset); + RETURN(rc ? rc : err); +} + +static int lov_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) +{ + struct lov_request_set *set; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int err = 0, rc = 0; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + /* for now, we only expect the following updates here */ + LASSERT(!(oinfo->oi_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE | + OBD_MD_FLMODE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLFLAGS | OBD_MD_FLSIZE | + OBD_MD_FLGROUP | OBD_MD_FLUID | + OBD_MD_FLGID | OBD_MD_FLFID | + OBD_MD_FLGENER))); + lov = &exp->exp_obd->u.lov; + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + rc = obd_setattr(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, NULL); + err = lov_update_setattr_set(set, req, rc); + if (err) { + CERROR("%s: setattr objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, + err); + if (!rc) + rc = err; + } + } + err = lov_fini_setattr_set(set); + if (!rc) + rc = err; + RETURN(rc); +} + +static int lov_setattr_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_setattr_set(lovset); + RETURN(rc ? rc : err); +} + +/* If @oti is given, the request goes from MDS and responses from OSTs are not + needed. Otherwise, a client is waiting for responses. */ +static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) { + LASSERT(oti); + LASSERT(oti->oti_logcookies); + } + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_setattr_set(exp, oinfo, oti, &set); + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "objid "DOSTID": %ux%u byte stripes\n", + POSTID(&oinfo->oi_md->lsm_oi), + oinfo->oi_md->lsm_stripe_count, + oinfo->oi_md->lsm_stripe_size); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oti->oti_logcookies = set->set_cookies + req->rq_stripe; + + CDEBUG(D_INFO, "objid "DOSTID"[%d] has subobj "DOSTID" at idx" + "%u\n", POSTID(&oinfo->oi_oa->o_oi), req->rq_stripe, + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx); + + rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, oti, rqset); + if (rc) { + CERROR("error: setattr objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc = %d\n", + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), + req->rq_idx, rc); + break; + } + } + + /* If we are not waiting for responses on async requests, return. */ + if (rc || !rqset || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_setattr_set(set); + RETURN(rc ? rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_setattr_interpret; + rqset->set_arg = (void *)set; + + RETURN(0); +} + +static int lov_punch_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_punch_set(lovset); + RETURN(rc ? rc : err); +} + +/* FIXME: maybe we'll just make one node the authoritative attribute node, then + * we can send this 'punch' to just the authoritative node and the nodes + * that the punch will affect. */ +static int lov_punch(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int rc = 0; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_punch_set(exp, oinfo, oti, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + rc = obd_punch(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, NULL, rqset); + if (rc) { + CERROR("%s: punch objid "DOSTID" subobj "DOSTID + " on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, rc); + break; + } + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + err = lov_fini_punch_set(set); + RETURN(rc ? rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_punch_interpret; + rqset->set_arg = (void *)set; + + RETURN(0); +} + +static int lov_sync_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + err = lov_fini_sync_set(lovset); + RETURN(rc ?: err); +} + +static int lov_sync(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_off start, obd_off end, + struct ptlrpc_request_set *rqset) +{ + struct lov_request_set *set = NULL; + struct lov_obd *lov; + struct list_head *pos; + struct lov_request *req; + int rc = 0; + ENTRY; + + ASSERT_LSM_MAGIC(oinfo->oi_md); + LASSERT(rqset != NULL); + + if (!exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_sync_set(exp, oinfo, start, end, &set); + if (rc) + RETURN(rc); + + CDEBUG(D_INFO, "fsync objid "DOSTID" ["LPX64", "LPX64"]\n", + POSTID(&set->set_oi->oi_oa->o_oi), start, end); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + rc = obd_sync(env, lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, req->rq_oi.oi_policy.l_extent.start, + req->rq_oi.oi_policy.l_extent.end, rqset); + if (rc) { + CERROR("%s: fsync objid "DOSTID" subobj "DOSTID + " on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, + POSTID(&set->set_oi->oi_oa->o_oi), + POSTID(&req->rq_oi.oi_oa->o_oi), req->rq_idx, + rc); + break; + } + } + + /* If we are not waiting for responses on async requests, return. */ + if (rc || list_empty(&rqset->set_requests)) { + int err = lov_fini_sync_set(set); + + RETURN(rc ?: err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_sync_interpret; + rqset->set_arg = (void *)set; + + RETURN(0); +} + +static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo, + obd_count oa_bufs, struct brw_page *pga) +{ + struct obd_info oinfo = { { { 0 } } }; + int i, rc = 0; + + oinfo.oi_oa = lov_oinfo->oi_oa; + + /* The caller just wants to know if there's a chance that this + * I/O can succeed */ + for (i = 0; i < oa_bufs; i++) { + int stripe = lov_stripe_number(lov_oinfo->oi_md, pga[i].off); + int ost = lov_oinfo->oi_md->lsm_oinfo[stripe]->loi_ost_idx; + obd_off start, end; + + if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off, + pga[i].off + pga[i].count - 1, + &start, &end)) + continue; + + if (!lov->lov_tgts[ost] || !lov->lov_tgts[ost]->ltd_active) { + CDEBUG(D_HA, "lov idx %d inactive\n", ost); + return -EIO; + } + + rc = obd_brw(OBD_BRW_CHECK, lov->lov_tgts[ost]->ltd_exp, &oinfo, + 1, &pga[i], NULL); + if (rc) + break; + } + return rc; +} + +static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti) +{ + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int err, rc = 0; + ENTRY; + + ASSERT_LSM_MAGIC(oinfo->oi_md); + + if (cmd == OBD_BRW_CHECK) { + rc = lov_brw_check(lov, oinfo, oa_bufs, pga); + RETURN(rc); + } + + rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + struct obd_export *sub_exp; + struct brw_page *sub_pga; + req = list_entry(pos, struct lov_request, rq_link); + + sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp; + sub_pga = set->set_pga + req->rq_pgaidx; + rc = obd_brw(cmd, sub_exp, &req->rq_oi, req->rq_oabufs, + sub_pga, oti); + if (rc) + break; + lov_update_common_set(set, req, rc); + } + + err = lov_fini_brw_set(set); + if (!rc) + rc = err; + RETURN(rc); +} + +static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset, + void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + ENTRY; + rc = lov_fini_enqueue_set(lovset, lovset->set_ei->ei_mode, rc, rqset); + RETURN(rc); +} + +static int lov_enqueue(struct obd_export *exp, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset) +{ + ldlm_mode_t mode = einfo->ei_mode; + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + ldlm_error_t rc; + ENTRY; + + LASSERT(oinfo); + ASSERT_LSM_MAGIC(oinfo->oi_md); + LASSERT(mode == (mode & -mode)); + + /* we should never be asked to replay a lock this way. */ + LASSERT((oinfo->oi_flags & LDLM_FL_REPLAY) == 0); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + rc = lov_prep_enqueue_set(exp, oinfo, einfo, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + rc = obd_enqueue(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, einfo, rqset); + if (rc != ELDLM_OK) + GOTO(out, rc); + } + + if (rqset && !list_empty(&rqset->set_requests)) { + LASSERT(rc == 0); + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_enqueue_interpret; + rqset->set_arg = (void *)set; + RETURN(rc); + } +out: + rc = lov_fini_enqueue_set(set, mode, rc, rqset); + RETURN(rc); +} + +static int lov_change_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, ldlm_iterator_t it, + void *data) +{ + struct lov_obd *lov; + int rc = 0, i; + ENTRY; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_stripe_md submd; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx); + continue; + } + + submd.lsm_oi = loi->loi_oi; + submd.lsm_stripe_count = 0; + rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, + &submd, it, data); + } + RETURN(rc); +} + +/* find any ldlm lock of the inode in lov + * return 0 not find + * 1 find one + * < 0 error */ +static int lov_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, ldlm_iterator_t it, + void *data) +{ + struct lov_obd *lov; + int rc = 0, i; + ENTRY; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_stripe_md submd; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx); + continue; + } + submd.lsm_oi = loi->loi_oi; + submd.lsm_stripe_count = 0; + rc = obd_find_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, + &submd, it, data); + if (rc != 0) + RETURN(rc); + } + RETURN(rc); +} + +static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm, + __u32 mode, struct lustre_handle *lockh) +{ + struct lov_request_set *set; + struct obd_info oinfo; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + struct lustre_handle *lov_lockhp; + int err = 0, rc = 0; + ENTRY; + + ASSERT_LSM_MAGIC(lsm); + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + LASSERT(lockh); + lov = &exp->exp_obd->u.lov; + rc = lov_prep_cancel_set(exp, &oinfo, lsm, mode, lockh, &set); + if (rc) + RETURN(rc); + + list_for_each(pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; + + rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp, + req->rq_oi.oi_md, mode, lov_lockhp); + rc = lov_update_common_set(set, req, rc); + if (rc) { + CERROR("%s: cancel objid "DOSTID" subobj " + DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi), + POSTID(&req->rq_oi.oi_md->lsm_oi), + req->rq_idx, rc); + err = rc; + } + + } + lov_fini_cancel_set(set); + RETURN(err); +} + +static int lov_cancel_unused(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_cancel_flags_t flags, void *opaque) +{ + struct lov_obd *lov; + int rc = 0, i; + ENTRY; + + if (!exp || !exp->exp_obd) + RETURN(-ENODEV); + + lov = &exp->exp_obd->u.lov; + if (lsm == NULL) { + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + err = obd_cancel_unused(lov->lov_tgts[i]->ltd_exp, NULL, + flags, opaque); + if (!rc) + rc = err; + } + RETURN(rc); + } + + ASSERT_LSM_MAGIC(lsm); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_stripe_md submd; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + int idx = loi->loi_ost_idx; + int err; + + if (!lov->lov_tgts[idx]) { + CDEBUG(D_HA, "lov idx %d NULL\n", idx); + continue; + } + + if (!lov->lov_tgts[idx]->ltd_active) + CDEBUG(D_HA, "lov idx %d inactive\n", idx); + + submd.lsm_oi = loi->loi_oi; + submd.lsm_stripe_count = 0; + err = obd_cancel_unused(lov->lov_tgts[idx]->ltd_exp, + &submd, flags, opaque); + if (err && lov->lov_tgts[idx]->ltd_active) { + CERROR("%s: cancel unused objid "DOSTID + " subobj "DOSTID" on OST idx %d: rc = %d\n", + exp->exp_obd->obd_name, POSTID(&lsm->lsm_oi), + POSTID(&loi->loi_oi), idx, err); + if (!rc) + rc = err; + } + } + RETURN(rc); +} + +int lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) +{ + struct lov_request_set *lovset = (struct lov_request_set *)data; + int err; + ENTRY; + + if (rc) + atomic_set(&lovset->set_completes, 0); + + err = lov_fini_statfs_set(lovset); + RETURN(rc ? rc : err); +} + +static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_request_set *set; + struct lov_request *req; + struct list_head *pos; + struct lov_obd *lov; + int rc = 0; + ENTRY; + + LASSERT(oinfo != NULL); + LASSERT(oinfo->oi_osfs != NULL); + + lov = &obd->u.lov; + rc = lov_prep_statfs_set(obd, oinfo, &set); + if (rc) + RETURN(rc); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, + &req->rq_oi, max_age, rqset); + if (rc) + break; + } + + if (rc || list_empty(&rqset->set_requests)) { + int err; + if (rc) + atomic_set(&set->set_completes, 0); + err = lov_fini_statfs_set(set); + RETURN(rc ? rc : err); + } + + LASSERT(rqset->set_interpret == NULL); + rqset->set_interpret = lov_statfs_interpret; + rqset->set_arg = (void *)set; + RETURN(0); +} + +static int lov_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + ENTRY; + + + /* for obdclass we forbid using obd_statfs_rqset, but prefer using async + * statfs requests */ + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = lov_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + RETURN(rc); +} + +static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; + ENTRY; + + switch (cmd) { + case IOC_OBD_STATFS: { + struct obd_ioctl_data *data = karg; + struct obd_device *osc_obd; + struct obd_statfs stat_buf = {0}; + __u32 index; + __u32 flags; + + memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); + if ((index >= count)) + RETURN(-ENODEV); + + if (!lov->lov_tgts[index]) + /* Try again with the next index */ + RETURN(-EAGAIN); + if (!lov->lov_tgts[index]->ltd_active) + RETURN(-ENODATA); + + osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + if (!osc_obd) + RETURN(-EINVAL); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + RETURN(-EFAULT); + + flags = uarg ? *(__u32*)uarg : 0; + /* got statfs data */ + rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + flags); + if (rc) + RETURN(rc); + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + RETURN(-EFAULT); + break; + } + case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data; + struct lov_desc *desc; + char *buf = NULL; + __u32 *genp; + + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) + RETURN(-EINVAL); + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { + obd_ioctl_freedata(buf, len); + RETURN(-EINVAL); + } + + if (sizeof(__u32) * count > data->ioc_inllen3) { + obd_ioctl_freedata(buf, len); + RETURN(-EINVAL); + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + memcpy(desc, &(lov->desc), sizeof(*desc)); + + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; + genp = (__u32 *)data->ioc_inlbuf3; + /* the uuid will be empty for deleted OSTs */ + for (i = 0; i < count; i++, uuidp++, genp++) { + if (!lov->lov_tgts[i]) + continue; + *uuidp = lov->lov_tgts[i]->ltd_uuid; + *genp = lov->lov_tgts[i]->ltd_gen; + } + + if (copy_to_user((void *)uarg, buf, len)) + rc = -EFAULT; + obd_ioctl_freedata(buf, len); + break; + } + case LL_IOC_LOV_SETSTRIPE: + rc = lov_setstripe(exp, len, karg, uarg); + break; + case LL_IOC_LOV_GETSTRIPE: + rc = lov_getstripe(exp, karg, uarg); + break; + case LL_IOC_LOV_SETEA: + rc = lov_setea(exp, karg, uarg); + break; + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + default: { + int set = 0; + + if (count == 0) + RETURN(-ENOTTY); + + for (i = 0; i < count; i++) { + int err; + struct obd_device *osc_obd; + + /* OST was disconnected */ + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) + continue; + + /* ll_umount_begin() sets force flag but for lov, not + * osc. Let's pass it through */ + osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); + osc_obd->obd_force = obddev->obd_force; + err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, + len, karg, uarg); + if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { + RETURN(err); + } else if (err) { + if (lov->lov_tgts[i]->ltd_active) { + CDEBUG(err == -ENOTTY ? + D_IOCTL : D_WARNING, + "iocontrol OSC %s on OST " + "idx %d cmd %x: err = %d\n", + lov_uuid2str(lov, i), + i, cmd, err); + if (!rc) + rc = err; + } + } else { + set = 1; + } + } + if (!set && !rc) + rc = -EIO; + } + } + + RETURN(rc); +} + +#define FIEMAP_BUFFER_SIZE 4096 + +/** + * Non-zero fe_logical indicates that this is a continuation FIEMAP + * call. The local end offset and the device are sent in the first + * fm_extent. This function calculates the stripe number from the index. + * This function returns a stripe_no on which mapping is to be restarted. + * + * This function returns fm_end_offset which is the in-OST offset at which + * mapping should be restarted. If fm_end_offset=0 is returned then caller + * will re-calculate proper offset in next stripe. + * Note that the first extent is passed to lov_get_info via the value field. + * + * \param fiemap fiemap request header + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe will be returned in this + */ +obd_size fiemap_calc_fm_end_offset(struct ll_user_fiemap *fiemap, + struct lov_stripe_md *lsm, obd_size fm_start, + obd_size fm_end, int *start_stripe) +{ + obd_size local_end = fiemap->fm_extents[0].fe_logical; + obd_off lun_start, lun_end; + obd_size fm_end_offset; + int stripe_no = -1, i; + + if (fiemap->fm_extent_count == 0 || + fiemap->fm_extents[0].fe_logical == 0) + return 0; + + /* Find out stripe_no from ost_index saved in the fe_device */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (lsm->lsm_oinfo[i]->loi_ost_idx == + fiemap->fm_extents[0].fe_device) { + stripe_no = i; + break; + } + } + if (stripe_no == -1) + return -EINVAL; + + /* If we have finished mapping on previous device, shift logical + * offset to start of next device */ + if ((lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end, + &lun_start, &lun_end)) != 0 && + local_end < lun_end) { + fm_end_offset = local_end; + *start_stripe = stripe_no; + } else { + /* This is a special value to indicate that caller should + * calculate offset in next stripe. */ + fm_end_offset = 0; + *start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count; + } + + return fm_end_offset; +} + +/** + * We calculate on which OST the mapping will end. If the length of mapping + * is greater than (stripe_size * stripe_count) then the last_stripe will + * will be one just before start_stripe. Else we check if the mapping + * intersects each OST and find last_stripe. + * This function returns the last_stripe and also sets the stripe_count + * over which the mapping is spread + * + * \param lsm striping information for the file + * \param fm_start logical start of mapping + * \param fm_end logical end of mapping + * \param start_stripe starting stripe of the mapping + * \param stripe_count the number of stripes across which to map is returned + * + * \retval last_stripe return the last stripe of the mapping + */ +int fiemap_calc_last_stripe(struct lov_stripe_md *lsm, obd_size fm_start, + obd_size fm_end, int start_stripe, + int *stripe_count) +{ + int last_stripe; + obd_off obd_start, obd_end; + int i, j; + + if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) { + last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 : + start_stripe - 1); + *stripe_count = lsm->lsm_stripe_count; + } else { + for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count; + i = (i + 1) % lsm->lsm_stripe_count, j++) { + if ((lov_stripe_intersects(lsm, i, fm_start, fm_end, + &obd_start, &obd_end)) == 0) + break; + } + *stripe_count = j; + last_stripe = (start_stripe + j - 1) %lsm->lsm_stripe_count; + } + + return last_stripe; +} + +/** + * Set fe_device and copy extents from local buffer into main return buffer. + * + * \param fiemap fiemap request header + * \param lcl_fm_ext array of local fiemap extents to be copied + * \param ost_index OST index to be written into the fm_device field for each + extent + * \param ext_count number of extents to be copied + * \param current_extent where to start copying in main extent array + */ +void fiemap_prepare_and_copy_exts(struct ll_user_fiemap *fiemap, + struct ll_fiemap_extent *lcl_fm_ext, + int ost_index, unsigned int ext_count, + int current_extent) +{ + char *to; + int ext; + + for (ext = 0; ext < ext_count; ext++) { + lcl_fm_ext[ext].fe_device = ost_index; + lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET; + } + + /* Copy fm_extent's from fm_local to return buffer */ + to = (char *)fiemap + fiemap_count_to_size(current_extent); + memcpy(to, lcl_fm_ext, ext_count * sizeof(struct ll_fiemap_extent)); +} + +/** + * Break down the FIEMAP request and send appropriate calls to individual OSTs. + * This also handles the restarting of FIEMAP calls in case mapping overflows + * the available number of extents in single call. + */ +static int lov_fiemap(struct lov_obd *lov, __u32 keylen, void *key, + __u32 *vallen, void *val, struct lov_stripe_md *lsm) +{ + struct ll_fiemap_info_key *fm_key = key; + struct ll_user_fiemap *fiemap = val; + struct ll_user_fiemap *fm_local = NULL; + struct ll_fiemap_extent *lcl_fm_ext; + int count_local; + unsigned int get_num_extents = 0; + int ost_index = 0, actual_start_stripe, start_stripe; + obd_size fm_start, fm_end, fm_length, fm_end_offset; + obd_size curr_loc; + int current_extent = 0, rc = 0, i; + int ost_eof = 0; /* EOF for object */ + int ost_done = 0; /* done with required mapping for this OST? */ + int last_stripe; + int cur_stripe = 0, cur_stripe_wrap = 0, stripe_count; + unsigned int buffer_size = FIEMAP_BUFFER_SIZE; + + if (lsm == NULL) + GOTO(out, rc = 0); + + if (fiemap_count_to_size(fm_key->fiemap.fm_extent_count) < buffer_size) + buffer_size = fiemap_count_to_size(fm_key->fiemap.fm_extent_count); + + OBD_ALLOC_LARGE(fm_local, buffer_size); + if (fm_local == NULL) + GOTO(out, rc = -ENOMEM); + lcl_fm_ext = &fm_local->fm_extents[0]; + + count_local = fiemap_size_to_count(buffer_size); + + memcpy(fiemap, &fm_key->fiemap, sizeof(*fiemap)); + fm_start = fiemap->fm_start; + fm_length = fiemap->fm_length; + /* Calculate start stripe, last stripe and length of mapping */ + actual_start_stripe = start_stripe = lov_stripe_number(lsm, fm_start); + fm_end = (fm_length == ~0ULL ? fm_key->oa.o_size : + fm_start + fm_length - 1); + /* If fm_length != ~0ULL but fm_start+fm_length-1 exceeds file size */ + if (fm_end > fm_key->oa.o_size) + fm_end = fm_key->oa.o_size; + + last_stripe = fiemap_calc_last_stripe(lsm, fm_start, fm_end, + actual_start_stripe, &stripe_count); + + fm_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fm_start, + fm_end, &start_stripe); + if (fm_end_offset == -EINVAL) + GOTO(out, rc = -EINVAL); + + if (fiemap->fm_extent_count == 0) { + get_num_extents = 1; + count_local = 0; + } + + /* Check each stripe */ + for (cur_stripe = start_stripe, i = 0; i < stripe_count; + i++, cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) { + obd_size req_fm_len; /* Stores length of required mapping */ + obd_size len_mapped_single_call; + obd_off lun_start, lun_end, obd_object_end; + unsigned int ext_count; + + cur_stripe_wrap = cur_stripe; + + /* Find out range of mapping on this stripe */ + if ((lov_stripe_intersects(lsm, cur_stripe, fm_start, fm_end, + &lun_start, &obd_object_end)) == 0) + continue; + + /* If this is a continuation FIEMAP call and we are on + * starting stripe then lun_start needs to be set to + * fm_end_offset */ + if (fm_end_offset != 0 && cur_stripe == start_stripe) + lun_start = fm_end_offset; + + if (fm_length != ~0ULL) { + /* Handle fm_start + fm_length overflow */ + if (fm_start + fm_length < fm_start) + fm_length = ~0ULL - fm_start; + lun_end = lov_size_to_stripe(lsm, fm_start + fm_length, + cur_stripe); + } else { + lun_end = ~0ULL; + } + + if (lun_start == lun_end) + continue; + + req_fm_len = obd_object_end - lun_start; + fm_local->fm_length = 0; + len_mapped_single_call = 0; + + /* If the output buffer is very large and the objects have many + * extents we may need to loop on a single OST repeatedly */ + ost_eof = 0; + ost_done = 0; + do { + if (get_num_extents == 0) { + /* Don't get too many extents. */ + if (current_extent + count_local > + fiemap->fm_extent_count) + count_local = fiemap->fm_extent_count - + current_extent; + } + + lun_start += len_mapped_single_call; + fm_local->fm_length = req_fm_len - len_mapped_single_call; + req_fm_len = fm_local->fm_length; + fm_local->fm_extent_count = count_local; + fm_local->fm_mapped_extents = 0; + fm_local->fm_flags = fiemap->fm_flags; + + fm_key->oa.o_oi = lsm->lsm_oinfo[cur_stripe]->loi_oi; + ost_index = lsm->lsm_oinfo[cur_stripe]->loi_ost_idx; + + if (ost_index < 0 || ost_index >=lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + /* If OST is inactive, return extent with UNKNOWN flag */ + if (!lov->lov_tgts[ost_index]->ltd_active) { + fm_local->fm_flags |= FIEMAP_EXTENT_LAST; + fm_local->fm_mapped_extents = 1; + + lcl_fm_ext[0].fe_logical = lun_start; + lcl_fm_ext[0].fe_length = obd_object_end - + lun_start; + lcl_fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN; + + goto inactive_tgt; + } + + fm_local->fm_start = lun_start; + fm_local->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER; + memcpy(&fm_key->fiemap, fm_local, sizeof(*fm_local)); + *vallen=fiemap_count_to_size(fm_local->fm_extent_count); + rc = obd_get_info(NULL, + lov->lov_tgts[ost_index]->ltd_exp, + keylen, key, vallen, fm_local, lsm); + if (rc != 0) + GOTO(out, rc); + +inactive_tgt: + ext_count = fm_local->fm_mapped_extents; + if (ext_count == 0) { + ost_done = 1; + /* If last stripe has hole at the end, + * then we need to return */ + if (cur_stripe_wrap == last_stripe) { + fiemap->fm_mapped_extents = 0; + goto finish; + } + break; + } + + /* If we just need num of extents then go to next device */ + if (get_num_extents) { + current_extent += ext_count; + break; + } + + len_mapped_single_call = lcl_fm_ext[ext_count-1].fe_logical - + lun_start + lcl_fm_ext[ext_count - 1].fe_length; + + /* Have we finished mapping on this device? */ + if (req_fm_len <= len_mapped_single_call) + ost_done = 1; + + /* Clear the EXTENT_LAST flag which can be present on + * last extent */ + if (lcl_fm_ext[ext_count-1].fe_flags & FIEMAP_EXTENT_LAST) + lcl_fm_ext[ext_count - 1].fe_flags &= + ~FIEMAP_EXTENT_LAST; + + curr_loc = lov_stripe_size(lsm, + lcl_fm_ext[ext_count - 1].fe_logical+ + lcl_fm_ext[ext_count - 1].fe_length, + cur_stripe); + if (curr_loc >= fm_key->oa.o_size) + ost_eof = 1; + + fiemap_prepare_and_copy_exts(fiemap, lcl_fm_ext, + ost_index, ext_count, + current_extent); + + current_extent += ext_count; + + /* Ran out of available extents? */ + if (current_extent >= fiemap->fm_extent_count) + goto finish; + } while (ost_done == 0 && ost_eof == 0); + + if (cur_stripe_wrap == last_stripe) + goto finish; + } + +finish: + /* Indicate that we are returning device offsets unless file just has + * single stripe */ + if (lsm->lsm_stripe_count > 1) + fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER; + + if (get_num_extents) + goto skip_last_device_calc; + + /* Check if we have reached the last stripe and whether mapping for that + * stripe is done. */ + if (cur_stripe_wrap == last_stripe) { + if (ost_done || ost_eof) + fiemap->fm_extents[current_extent - 1].fe_flags |= + FIEMAP_EXTENT_LAST; + } + +skip_last_device_calc: + fiemap->fm_mapped_extents = current_extent; + +out: + OBD_FREE_LARGE(fm_local, buffer_size); + return rc; +} + +static int lov_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + int i, rc; + ENTRY; + + if (!vallen || !val) + RETURN(-EFAULT); + + obd_getref(obddev); + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + struct { + char name[16]; + struct ldlm_lock *lock; + } *data = key; + struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name; + struct lov_oinfo *loi; + __u32 *stripe = val; + + if (*vallen < sizeof(*stripe)) + GOTO(out, rc = -EFAULT); + *vallen = sizeof(*stripe); + + /* XXX This is another one of those bits that will need to + * change if we ever actually support nested LOVs. It uses + * the lock's export to find out which stripe it is. */ + /* XXX - it's assumed all the locks for deleted OSTs have + * been cancelled. Also, the export for deleted OSTs will + * be NULL and won't match the lock's export. */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + loi = lsm->lsm_oinfo[i]; + if (!lov->lov_tgts[loi->loi_ost_idx]) + continue; + if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp == + data->lock->l_conn_export && + ostid_res_name_eq(&loi->loi_oi, res_id)) { + *stripe = i; + GOTO(out, rc = 0); + } + } + LDLM_ERROR(data->lock, "lock on inode without such object"); + dump_lsm(D_ERROR, lsm); + GOTO(out, rc = -ENXIO); + } else if (KEY_IS(KEY_LAST_ID)) { + struct obd_id_info *info = val; + __u32 size = sizeof(obd_id); + struct lov_tgt_desc *tgt; + + LASSERT(*vallen == sizeof(struct obd_id_info)); + tgt = lov->lov_tgts[info->idx]; + + if (!tgt || !tgt->ltd_active) + GOTO(out, rc = -ESRCH); + + rc = obd_get_info(env, tgt->ltd_exp, keylen, key, + &size, info->data, NULL); + GOTO(out, rc = 0); + } else if (KEY_IS(KEY_LOVDESC)) { + struct lov_desc *desc_ret = val; + *desc_ret = lov->desc; + + GOTO(out, rc = 0); + } else if (KEY_IS(KEY_FIEMAP)) { + rc = lov_fiemap(lov, keylen, key, vallen, val, lsm); + GOTO(out, rc); + } else if (KEY_IS(KEY_CONNECT_FLAG)) { + struct lov_tgt_desc *tgt; + __u64 ost_idx = *((__u64*)val); + + LASSERT(*vallen == sizeof(__u64)); + LASSERT(ost_idx < lov->desc.ld_tgt_count); + tgt = lov->lov_tgts[ost_idx]; + + if (!tgt || !tgt->ltd_exp) + GOTO(out, rc = -ESRCH); + + *((__u64 *)val) = exp_connect_flags(tgt->ltd_exp); + GOTO(out, rc = 0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = lov->desc.ld_tgt_count; + GOTO(out, rc = 0); + } + + rc = -EINVAL; + +out: + obd_putref(obddev); + RETURN(rc); +} + +static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct lov_obd *lov = &obddev->u.lov; + obd_count count; + int i, rc = 0, err; + struct lov_tgt_desc *tgt; + unsigned incr, check_uuid, + do_inactive, no_set; + unsigned next_id = 0, mds_con = 0, capa = 0; + ENTRY; + + incr = check_uuid = do_inactive = no_set = 0; + if (set == NULL) { + no_set = 1; + set = ptlrpc_prep_set(); + if (!set) + RETURN(-ENOMEM); + } + + obd_getref(obddev); + count = lov->desc.ld_tgt_count; + + if (KEY_IS(KEY_NEXT_ID)) { + count = vallen / sizeof(struct obd_id_info); + vallen = sizeof(obd_id); + incr = sizeof(struct obd_id_info); + do_inactive = 1; + next_id = 1; + } else if (KEY_IS(KEY_CHECKSUM)) { + do_inactive = 1; + } else if (KEY_IS(KEY_EVICT_BY_NID)) { + /* use defaults: do_inactive = incr = 0; */ + } else if (KEY_IS(KEY_MDS_CONN)) { + mds_con = 1; + } else if (KEY_IS(KEY_CAPA_KEY)) { + capa = 1; + } else if (KEY_IS(KEY_CACHE_SET)) { + LASSERT(lov->lov_cache == NULL); + lov->lov_cache = val; + do_inactive = 1; + } + + for (i = 0; i < count; i++, val = (char *)val + incr) { + if (next_id) { + tgt = lov->lov_tgts[((struct obd_id_info*)val)->idx]; + } else { + tgt = lov->lov_tgts[i]; + } + /* OST was disconnected */ + if (!tgt || !tgt->ltd_exp) + continue; + + /* OST is inactive and we don't want inactive OSCs */ + if (!tgt->ltd_active && !do_inactive) + continue; + + if (mds_con) { + struct mds_group_info *mgi; + + LASSERT(vallen == sizeof(*mgi)); + mgi = (struct mds_group_info *)val; + + /* Only want a specific OSC */ + if (mgi->uuid && !obd_uuid_equals(mgi->uuid, + &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, sizeof(int), + &mgi->group, set); + } else if (next_id) { + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, + ((struct obd_id_info*)val)->data, set); + } else if (capa) { + struct mds_capa_info *info = (struct mds_capa_info*)val; + + LASSERT(vallen == sizeof(*info)); + + /* Only want a specific OSC */ + if (info->uuid && + !obd_uuid_equals(info->uuid, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, keylen, + key, sizeof(*info->capa), + info->capa, set); + } else { + /* Only want a specific OSC */ + if (check_uuid && + !obd_uuid_equals(val, &tgt->ltd_uuid)) + continue; + + err = obd_set_info_async(env, tgt->ltd_exp, + keylen, key, vallen, val, set); + } + + if (!rc) + rc = err; + } + + obd_putref(obddev); + if (no_set) { + err = ptlrpc_set_wait(set); + if (!rc) + rc = err; + ptlrpc_set_destroy(set); + } + RETURN(rc); +} + +static int lov_extent_calc(struct obd_export *exp, struct lov_stripe_md *lsm, + int cmd, __u64 *offset) +{ + __u32 ssize = lsm->lsm_stripe_size; + __u64 start; + + start = *offset; + lov_do_div64(start, ssize); + start = start * ssize; + + CDEBUG(D_DLMTRACE, "offset "LPU64", stripe %u, start "LPU64 + ", end "LPU64"\n", *offset, ssize, start, + start + ssize - 1); + if (cmd == OBD_CALC_STRIPE_END) { + *offset = start + ssize - 1; + } else if (cmd == OBD_CALC_STRIPE_START) { + *offset = start; + } else { + LBUG(); + } + + RETURN(0); +} + +void lov_stripe_lock(struct lov_stripe_md *md) +{ + LASSERT(md->lsm_lock_owner != current_pid()); + spin_lock(&md->lsm_lock); + LASSERT(md->lsm_lock_owner == 0); + md->lsm_lock_owner = current_pid(); +} +EXPORT_SYMBOL(lov_stripe_lock); + +void lov_stripe_unlock(struct lov_stripe_md *md) +{ + LASSERT(md->lsm_lock_owner == current_pid()); + md->lsm_lock_owner = 0; + spin_unlock(&md->lsm_lock); +} +EXPORT_SYMBOL(lov_stripe_unlock); + +static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_tgt_desc *tgt; + __u64 curspace = 0; + __u64 bhardlimit = 0; + int i, rc = 0; + ENTRY; + + if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON && + oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF && + oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != Q_INITQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != Q_FINVALIDATE) { + CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd); + RETURN(-EFAULT); + } + + /* for lov tgt */ + obd_getref(obd); + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + tgt = lov->lov_tgts[i]; + + if (!tgt) + continue; + + if (!tgt->ltd_active || tgt->ltd_reap) { + if (oqctl->qc_cmd == Q_GETOQUOTA && + lov->lov_tgts[i]->ltd_activate) { + rc = -EREMOTEIO; + CERROR("ost %d is inactive\n", i); + } else { + CDEBUG(D_HA, "ost %d is inactive\n", i); + } + continue; + } + + err = obd_quotactl(tgt->ltd_exp, oqctl); + if (err) { + if (tgt->ltd_active && !rc) + rc = err; + continue; + } + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + curspace += oqctl->qc_dqblk.dqb_curspace; + bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; + } + } + obd_putref(obd); + + if (oqctl->qc_cmd == Q_GETOQUOTA) { + oqctl->qc_dqblk.dqb_curspace = curspace; + oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; + } + RETURN(rc); +} + +static int lov_quotacheck(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0; + ENTRY; + + obd_getref(obd); + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + /* Skip quota check on the administratively disabled OSTs. */ + if (!lov->lov_tgts[i]->ltd_activate) { + CWARN("lov idx %d was administratively disabled, " + "skip quotacheck on it.\n", i); + continue; + } + + if (!lov->lov_tgts[i]->ltd_active) { + CERROR("lov idx %d inactive\n", i); + rc = -EIO; + goto out; + } + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_activate) + continue; + + err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + +out: + obd_putref(obd); + + RETURN(rc); +} + +struct obd_ops lov_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = lov_setup, + .o_precleanup = lov_precleanup, + .o_cleanup = lov_cleanup, + //.o_process_config = lov_process_config, + .o_connect = lov_connect, + .o_disconnect = lov_disconnect, + .o_statfs = lov_statfs, + .o_statfs_async = lov_statfs_async, + .o_packmd = lov_packmd, + .o_unpackmd = lov_unpackmd, + .o_create = lov_create, + .o_destroy = lov_destroy, + .o_getattr = lov_getattr, + .o_getattr_async = lov_getattr_async, + .o_setattr = lov_setattr, + .o_setattr_async = lov_setattr_async, + .o_brw = lov_brw, + .o_merge_lvb = lov_merge_lvb, + .o_adjust_kms = lov_adjust_kms, + .o_punch = lov_punch, + .o_sync = lov_sync, + .o_enqueue = lov_enqueue, + .o_change_cbdata = lov_change_cbdata, + .o_find_cbdata = lov_find_cbdata, + .o_cancel = lov_cancel, + .o_cancel_unused = lov_cancel_unused, + .o_iocontrol = lov_iocontrol, + .o_get_info = lov_get_info, + .o_set_info_async = lov_set_info_async, + .o_extent_calc = lov_extent_calc, + .o_llog_init = lov_llog_init, + .o_llog_finish = lov_llog_finish, + .o_notify = lov_notify, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, + .o_getref = lov_getref, + .o_putref = lov_putref, + .o_quotactl = lov_quotactl, + .o_quotacheck = lov_quotacheck, +}; + +struct kmem_cache *lov_oinfo_slab; + +extern struct lu_kmem_descr lov_caches[]; + +int __init lov_init(void) +{ + struct lprocfs_static_vars lvars = { 0 }; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + + lov_oinfo_slab = kmem_cache_create("lov_oinfo", + sizeof(struct lov_oinfo), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); + return -ENOMEM; + } + lprocfs_lov_init_vars(&lvars); + + rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars, + LUSTRE_LOV_NAME, &lov_device_type); + + if (rc) { + kmem_cache_destroy(lov_oinfo_slab); + lu_kmem_fini(lov_caches); + } + + RETURN(rc); +} + +static void /*__exit*/ lov_exit(void) +{ + class_unregister_type(LUSTRE_LOV_NAME); + kmem_cache_destroy(lov_oinfo_slab); + + lu_kmem_fini(lov_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver"); +MODULE_LICENSE("GPL"); + +cfs_module(lov, LUSTRE_VERSION_STRING, lov_init, lov_exit); diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c new file mode 100644 index 000000000000..aa8ae80e8121 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_object.c @@ -0,0 +1,942 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" +#include + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Layout operations. + * + */ + +struct lov_layout_operations { + int (*llo_init)(const struct lu_env *env, struct lov_device *dev, + struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state); + int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + void (*llo_install)(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state); + int (*llo_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + int (*llo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + int (*llo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +}; + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); + +/***************************************************************************** + * + * Lov object layout operations. + * + */ + +static void lov_install_empty(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ + /* + * File without objects. + */ +} + +static int lov_init_empty(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + return 0; +} + +static void lov_install_raid0(const struct lu_env *env, + struct lov_object *lov, + union lov_layout_state *state) +{ +} + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + ENTRY; + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *stripe, + struct lov_layout_raid0 *r0, int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + struct lov_oinfo *oinfo; + int result; + + if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { + /* For sanity:test_206. + * Do not leave the object in cache to avoid accessing + * freed memory. This is because osc_object is referring to + * lov_oinfo of lsm_stripe_data which will be freed due to + * this failure. */ + cl_object_kill(env, stripe); + cl_object_put(env, stripe); + return -EIO; + } + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(stripe); + parent = subhdr->coh_parent; + + oinfo = lov->lo_lsm->lsm_oinfo[idx]; + CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: ostid: "DOSTID + " idx: %d gen: %d\n", + PFID(&subhdr->coh_lu.loh_fid), subhdr, idx, + PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi), + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + if (parent == NULL) { + subhdr->coh_parent = hdr; + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&stripe->co_lu, "lov-parent", lov); + r0->lo_sub[idx] = cl2lovsub(stripe); + r0->lo_sub[idx]->lso_super = lov; + r0->lo_sub[idx]->lso_index = idx; + result = 0; + } else { + struct lu_object *old_obj; + struct lov_object *old_lov; + unsigned int mask = D_INODE; + + old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); + LASSERT(old_obj != NULL); + old_lov = cl2lov(lu2cl(old_obj)); + if (old_lov->lo_layout_invalid) { + /* the object's layout has already changed but isn't + * refreshed */ + lu_object_unhash(env, &stripe->co_lu); + result = -EAGAIN; + } else { + mask = D_ERROR; + result = -EIO; + } + + LU_OBJECT_DEBUG(mask, env, &stripe->co_lu, + "stripe %d is already owned.\n", idx); + LU_OBJECT_DEBUG(mask, env, old_obj, "owned.\n"); + LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); + cl_object_put(env, stripe); + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + int result; + int i; + + struct cl_object *stripe; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + struct lu_fid *ofid = <i->lti_fid; + struct lov_layout_raid0 *r0 = &state->raid0; + + ENTRY; + + if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) { + dump_lsm(D_ERROR, lsm); + LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n", + LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic); + } + + LASSERT(lov->lo_lsm == NULL); + lov->lo_lsm = lsm_addref(lsm); + r0->lo_nr = lsm->lsm_stripe_count; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + + OBD_ALLOC_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + if (r0->lo_sub != NULL) { + result = 0; + subconf->coc_inode = conf->coc_inode; + spin_lock_init(&r0->lo_sub_lock); + /* + * Create stripe cl_objects. + */ + for (i = 0; i < r0->lo_nr && result == 0; ++i) { + struct cl_device *subdev; + struct lov_oinfo *oinfo = lsm->lsm_oinfo[i]; + int ost_idx = oinfo->loi_ost_idx; + + result = ostid_to_fid(ofid, &oinfo->loi_oi, + oinfo->loi_ost_idx); + if (result != 0) + GOTO(out, result); + + subdev = lovsub2cl_dev(dev->ld_target[ost_idx]); + subconf->u.coc_oinfo = oinfo; + LASSERTF(subdev != NULL, "not init ost %d\n", ost_idx); + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + stripe = lov_sub_find(env, subdev, ofid, subconf); + if (!IS_ERR(stripe)) { + result = lov_init_sub(env, lov, stripe, r0, i); + if (result == -EAGAIN) { /* try again */ + --i; + result = 0; + } + } else { + result = PTR_ERR(stripe); + } + } + } else + result = -ENOMEM; +out: + RETURN(result); +} + +static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY); + + lov_layout_wait(env, lov); + + cl_object_prune(env, &lov->lo_cl); + return 0; +} + +static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, + struct lovsub_object *los, int idx) +{ + struct cl_object *sub; + struct lov_layout_raid0 *r0; + struct lu_site *site; + struct lu_site_bkt_data *bkt; + wait_queue_t *waiter; + + r0 = &lov->u.raid0; + LASSERT(r0->lo_sub[idx] == los); + + sub = lovsub2cl(los); + site = sub->co_lu.lo_dev->ld_site; + bkt = lu_site_bkt_from_fid(site, &sub->co_lu.lo_header->loh_fid); + + cl_object_kill(env, sub); + /* release a reference to the sub-object and ... */ + lu_object_ref_del(&sub->co_lu, "lov-parent", lov); + cl_object_put(env, sub); + + /* ... wait until it is actually destroyed---sub-object clears its + * ->lo_sub[] slot in lovsub_object_fini() */ + if (r0->lo_sub[idx] == los) { + waiter = &lov_env_info(env)->lti_waiter; + init_waitqueue_entry_current(waiter); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) { + /* this wait-queue is signaled at the end of + * lu_object_free(). */ + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock(&r0->lo_sub_lock); + if (r0->lo_sub[idx] == los) { + spin_unlock(&r0->lo_sub_lock); + waitq_wait(waiter, TASK_UNINTERRUPTIBLE); + } else { + spin_unlock(&r0->lo_sub_lock); + set_current_state(TASK_RUNNING); + break; + } + } + remove_wait_queue(&bkt->lsb_marche_funebre, waiter); + } + LASSERT(r0->lo_sub[idx] == NULL); +} + +static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + ENTRY; + + dump_lsm(D_INODE, lsm); + + lov_layout_wait(env, lov); + if (r0->lo_sub != NULL) { + for (i = 0; i < r0->lo_nr; ++i) { + struct lovsub_object *los = r0->lo_sub[i]; + + if (los != NULL) { + cl_locks_prune(env, &los->lso_cl, 1); + /* + * If top-level object is to be evicted from + * the cache, so are its sub-objects. + */ + lov_subobject_kill(env, lov, los, i); + } + } + } + cl_object_prune(env, &lov->lo_cl); + RETURN(0); +} + +static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + LASSERT(lov->lo_type == LLT_EMPTY); +} + +static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov, + union lov_layout_state *state) +{ + struct lov_layout_raid0 *r0 = &state->raid0; + ENTRY; + + if (r0->lo_sub != NULL) { + OBD_FREE_LARGE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + r0->lo_sub = NULL; + } + + dump_lsm(D_INODE, lov->lo_lsm); + lov_free_memmd(&lov->lo_lsm); + + EXIT; +} + +static int lov_print_empty(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); + return 0; +} + +static int lov_print_raid0(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct lov_object *lov = lu2lov(o); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct lov_stripe_md *lsm = lov->lo_lsm; + int i; + + (*p)(env, cookie, "stripes: %d, %svalid, lsm{%p 0x%08X %d %u %u}: \n", + r0->lo_nr, lov->lo_layout_invalid ? "in" : "", lsm, + lsm->lsm_magic, atomic_read(&lsm->lsm_refc), + lsm->lsm_stripe_count, lsm->lsm_layout_gen); + for (i = 0; i < r0->lo_nr; ++i) { + struct lu_object *sub; + + if (r0->lo_sub[i] != NULL) { + sub = lovsub2lu(r0->lo_sub[i]); + lu_object_print(env, cookie, p, sub); + } else + (*p)(env, cookie, "sub %d absent\n", i); + } + return 0; +} + +/** + * Implements cl_object_operations::coo_attr_get() method for an object + * without stripes (LLT_EMPTY layout type). + * + * The only attributes this layer is authoritative in this case is + * cl_attr::cat_blocks---it's 0. + */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct cl_attr *lov_attr = &r0->lo_attr; + int result = 0; + + ENTRY; + + /* this is called w/o holding type guard mutex, so it must be inside + * an on going IO otherwise lsm may be replaced. + * LU-2117: it turns out there exists one exception. For mmaped files, + * the lock of those files may be requested in the other file's IO + * context, and this function is called in ccc_lock_state(), it will + * hit this assertion. + * Anyway, it's still okay to call attr_get w/o type guard as layout + * can't go if locks exist. */ + /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */ + + if (!r0->lo_attr_valid) { + struct lov_stripe_md *lsm = lov->lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + __u64 kms = 0; + + memset(lvb, 0, sizeof(*lvb)); + /* XXX: timestamps can be negative by sanity:test_39m, + * how can it be? */ + lvb->lvb_atime = LLONG_MIN; + lvb->lvb_ctime = LLONG_MIN; + lvb->lvb_mtime = LLONG_MIN; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(lov_attr, lvb); + lov_attr->cat_kms = kms; + r0->lo_attr_valid = 1; + } + } + if (result == 0) { /* merge results */ + attr->cat_blocks = lov_attr->cat_blocks; + attr->cat_size = lov_attr->cat_size; + attr->cat_kms = lov_attr->cat_kms; + if (attr->cat_atime < lov_attr->cat_atime) + attr->cat_atime = lov_attr->cat_atime; + if (attr->cat_ctime < lov_attr->cat_ctime) + attr->cat_ctime = lov_attr->cat_ctime; + if (attr->cat_mtime < lov_attr->cat_mtime) + attr->cat_mtime = lov_attr->cat_mtime; + } + RETURN(result); +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_install = lov_install_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = lov_lock_init_empty, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty + }, + [LLT_RAID0] = { + .llo_init = lov_init_raid0, + .llo_delete = lov_delete_raid0, + .llo_fini = lov_fini_raid0, + .llo_install = lov_install_raid0, + .llo_print = lov_print_raid0, + .llo_page_init = lov_page_init_raid0, + .llo_lock_init = lov_lock_init_raid0, + .llo_io_init = lov_io_init_raid0, + .llo_getattr = lov_attr_get_raid0 + } +}; + + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +static inline void lov_conf_freeze(struct lov_object *lov) +{ + if (lov->lo_owner != current) + down_read(&lov->lo_type_guard); +} + +static inline void lov_conf_thaw(struct lov_object *lov) +{ + if (lov->lo_owner != current) + up_read(&lov->lo_type_guard); +} + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + if (__lock) \ + lov_conf_freeze(__obj); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + lov_conf_thaw(__obj); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) \ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + lov_conf_freeze(__obj); \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + lov_conf_thaw(__obj); \ +} while (0) + +static void lov_conf_lock(struct lov_object *lov) +{ + LASSERT(lov->lo_owner != current); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = current; +} + +static void lov_conf_unlock(struct lov_object *lov) +{ + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); +} + +static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) +{ + struct l_wait_info lwi = { 0 }; + ENTRY; + + while (atomic_read(&lov->lo_active_ios) > 0) { + CDEBUG(D_INODE, "file:"DFID" wait for active IO, now: %d.\n", + PFID(lu_object_fid(lov2lu(lov))), + atomic_read(&lov->lo_active_ios)); + + l_wait_event(lov->lo_waitq, + atomic_read(&lov->lo_active_ios) == 0, &lwi); + } + RETURN(0); +} + +static int lov_layout_change(const struct lu_env *unused, + struct lov_object *lov, + const struct cl_object_conf *conf) +{ + int result; + enum lov_layout_type llt = LLT_EMPTY; + union lov_layout_state *state = &lov->u; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + + struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); + void *cookie; + struct lu_env *env; + int refcheck; + ENTRY; + + LASSERT(0 <= lov->lo_type && lov->lo_type < ARRAY_SIZE(lov_dispatch)); + + if (conf->u.coc_md != NULL && conf->u.coc_md->lsm != NULL) + llt = LLT_RAID0; /* only raid0 is supported. */ + LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch)); + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + cl_env_reexit(cookie); + RETURN(PTR_ERR(env)); + } + + old_ops = &lov_dispatch[lov->lo_type]; + new_ops = &lov_dispatch[llt]; + + result = old_ops->llo_delete(env, lov, &lov->u); + if (result == 0) { + old_ops->llo_fini(env, lov, &lov->u); + + LASSERT(atomic_read(&lov->lo_active_ios) == 0); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + lov->lo_type = LLT_EMPTY; + result = new_ops->llo_init(env, + lu2lov_dev(lov->lo_cl.co_lu.lo_dev), + lov, conf, state); + if (result == 0) { + new_ops->llo_install(env, lov, state); + lov->lo_type = llt; + } else { + new_ops->llo_delete(env, lov, state); + new_ops->llo_fini(env, lov, state); + /* this file becomes an EMPTY file. */ + } + } + + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + RETURN(result); +} + +/***************************************************************************** + * + * Lov object operations. + * + */ + +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_device *dev = lu2lov_dev(obj->lo_dev); + struct lov_object *lov = lu2lov(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov->u; + const struct lov_layout_operations *ops; + int result; + + ENTRY; + init_rwsem(&lov->lo_type_guard); + atomic_set(&lov->lo_active_ios, 0); + init_waitqueue_head(&lov->lo_waitq); + + cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); + + /* no locking is necessary, as object is being created */ + lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY; + ops = &lov_dispatch[lov->lo_type]; + result = ops->llo_init(env, dev, lov, cconf, set); + if (result == 0) + ops->llo_install(env, lov, set); + RETURN(result); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_stripe_md *lsm = NULL; + struct lov_object *lov = cl2lov(obj); + int result = 0; + ENTRY; + + lov_conf_lock(lov); + if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { + lov->lo_layout_invalid = true; + GOTO(out, result = 0); + } + + if (conf->coc_opc == OBJECT_CONF_WAIT) { + if (lov->lo_layout_invalid && + atomic_read(&lov->lo_active_ios) > 0) { + lov_conf_unlock(lov); + result = lov_layout_wait(env, lov); + lov_conf_lock(lov); + } + GOTO(out, result); + } + + LASSERT(conf->coc_opc == OBJECT_CONF_SET); + + if (conf->u.coc_md != NULL) + lsm = conf->u.coc_md->lsm; + if ((lsm == NULL && lov->lo_lsm == NULL) || + (lsm != NULL && lov->lo_lsm != NULL && + lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen)) { + /* same version of layout */ + lov->lo_layout_invalid = false; + GOTO(out, result = 0); + } + + /* will change layout - check if there still exists active IO. */ + if (atomic_read(&lov->lo_active_ios) > 0) { + lov->lo_layout_invalid = true; + GOTO(out, result = -EBUSY); + } + + lov->lo_layout_invalid = lov_layout_change(env, lov, conf); + EXIT; + +out: + lov_conf_unlock(lov); + RETURN(result); +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); + EXIT; +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); + EXIT; +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); +} + +int lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), + llo_page_init, env, obj, page, vmpage); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + !io->ci_ignore_layout, env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. + */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + /* No need to lock because we've taken one refcount of layout. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock, + io); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_set = lov_attr_set, + .coo_conf_set = lov_conf_set +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lov, lov_object_kmem, __GFP_IO); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. + */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) +{ + struct lov_stripe_md *lsm = NULL; + + lov_conf_freeze(lov); + if (lov->lo_lsm != NULL) { + lsm = lsm_addref(lov->lo_lsm); + CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), + lov->lo_layout_invalid, current); + } + lov_conf_thaw(lov); + return lsm; +} + +void lov_lsm_decref(struct lov_object *lov, struct lov_stripe_md *lsm) +{ + if (lsm == NULL) + return; + + CDEBUG(D_INODE, "lsm %p decref %d by %p.\n", + lsm, atomic_read(&lsm->lsm_refc), current); + + lov_free_memmd(&lsm); +} + +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj) +{ + struct lu_object *luobj; + struct lov_stripe_md *lsm = NULL; + + if (clobj == NULL) + return NULL; + + luobj = lu_object_locate(&cl_object_header(clobj)->coh_lu, + &lov_device_type); + if (luobj != NULL) + lsm = lov_lsm_addref(lu2lov(luobj)); + return lsm; +} +EXPORT_SYMBOL(lov_lsm_get); + +void lov_lsm_put(struct cl_object *unused, struct lov_stripe_md *lsm) +{ + if (lsm != NULL) + lov_free_memmd(&lsm); +} +EXPORT_SYMBOL(lov_lsm_put); + +int lov_read_and_clear_async_rc(struct cl_object *clob) +{ + struct lu_object *luobj; + int rc = 0; + ENTRY; + + luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, + &lov_device_type); + if (luobj != NULL) { + struct lov_object *lov = lu2lov(luobj); + + lov_conf_freeze(lov); + switch (lov->lo_type) { + case LLT_RAID0: { + struct lov_stripe_md *lsm; + int i; + + lsm = lov->lo_lsm; + LASSERT(lsm != NULL); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + } + case LLT_EMPTY: + break; + default: + LBUG(); + } + lov_conf_thaw(lov); + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_read_and_clear_async_rc); + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c new file mode 100644 index 000000000000..f62b7e53b665 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_offset.c @@ -0,0 +1,267 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include + +#include "lov_internal.h" + +/* compute object size given "stripeno" and the ost size */ +obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + unsigned long stripe_size; + obd_off swidth; + obd_size lov_size; + int magic = lsm->lsm_magic; + ENTRY; + + if (ost_size == 0) + RETURN(0); + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_size = lov_do_div64(ost_size, ssize); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; + + RETURN(lov_size); +} + +/* we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. + * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. */ +int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, + int stripeno, obd_off *obdoff) +{ + unsigned long ssize = lsm->lsm_stripe_size; + obd_off stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + int ret = 0; + + if (lov_off == OBD_OBJECT_EOF) { + *obdoff = OBD_OBJECT_EOF; + return 0; + } + + LASSERT(lsm_op_find(magic) != NULL); + + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(lov_off, swidth); + + this_stripe = (obd_off)stripeno * ssize; + if (stripe_off < this_stripe) { + stripe_off = 0; + ret = -1; + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + stripe_off = ssize; + ret = 1; + } + } + + *obdoff = lov_off * ssize + stripe_off; + return ret; +} + +/* Given a whole-file size and a stripe number, give the file size which + * corresponds to the individual object of that stripe. + * + * This behaves basically in the same was as lov_stripe_offset, except that + * file sizes falling before the beginning of a stripe are clamped to the end + * of the previous stripe, not the beginning of the next: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * if clamped to stripe 2 becomes: + * + * S + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + */ +obd_off lov_size_to_stripe(struct lov_stripe_md *lsm, obd_off file_size, + int stripeno) +{ + unsigned long ssize = lsm->lsm_stripe_size; + obd_off stripe_off, this_stripe, swidth; + int magic = lsm->lsm_magic; + + if (file_size == OBD_OBJECT_EOF) + return OBD_OBJECT_EOF; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size, + &swidth); + + /* lov_do_div64(a, b) returns a % b, and a = a / b */ + stripe_off = lov_do_div64(file_size, swidth); + + this_stripe = (obd_off)stripeno * ssize; + if (stripe_off < this_stripe) { + /* Move to end of previous stripe, or zero */ + if (file_size > 0) { + file_size--; + stripe_off = ssize; + } else { + stripe_off = 0; + } + } else { + stripe_off -= this_stripe; + + if (stripe_off >= ssize) { + /* Clamp to end of this stripe */ + stripe_off = ssize; + } + } + + return (file_size * ssize + stripe_off); +} + +/* given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. */ +int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + obd_off start, obd_off end, + obd_off *obd_start, obd_off *obd_end) +{ + int start_side, end_side; + + start_side = lov_stripe_offset(lsm, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, end, stripeno, obd_end); + + CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); + + /* this stripe doesn't intersect the file extent when neither + * start or the end intersected the stripe and obd_start and + * obd_end got rounded up to the save value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) */ + if (end_side != 0) + (*obd_end)--; + + return 1; +} + +/* compute which stripe number "lov_off" will be written into */ +int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off) +{ + unsigned long ssize = lsm->lsm_stripe_size; + obd_off stripe_off, swidth; + int magic = lsm->lsm_magic; + + LASSERT(lsm_op_find(magic) != NULL); + lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth); + + stripe_off = lov_do_div64(lov_off, swidth); + + /* Puts stripe_off/ssize result into stripe_off */ + lov_do_div64(stripe_off, ssize); + + return stripe_off; +} diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c new file mode 100644 index 000000000000..8bb57aa5f418 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_pack.c @@ -0,0 +1,677 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include +#include +#include +#include +#include +#include + +#include "lov_internal.h" + +static void lov_dump_lmm_common(int level, void *lmmp) +{ + struct lov_mds_md *lmm = lmmp; + struct ost_id oi; + + lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); + CDEBUG(level, "objid "DOSTID", magic 0x%08x, pattern %#x\n", + POSTID(&oi), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le16_to_cpu(lmm->lmm_stripe_count), + le16_to_cpu(lmm->lmm_layout_gen)); +} + +static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, + int stripe_count) +{ + int i; + + if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + stripe_count, LOV_V1_INSANE_STRIPE_COUNT); + } + + for (i = 0; i < stripe_count; ++i, ++lod) { + struct ost_id oi; + + ostid_le_to_cpu(&lod->l_ost_oi, &oi); + CDEBUG(level, "stripe %u idx %u subobj "DOSTID"\n", i, + le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); + } +} + +void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) +{ + lov_dump_lmm_common(level, lmm); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + lov_dump_lmm_common(level, lmm); + CDEBUG(level,"pool_name "LOV_POOLNAMEF"\n", lmm->lmm_pool_name); + lov_dump_lmm_objects(level, lmm->lmm_objects, + le16_to_cpu(lmm->lmm_stripe_count)); +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = ((struct lov_mds_md_v1 *)(lmm))->lmm_magic; + switch (magic) { + case LOV_MAGIC_V1: + return lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm)); + case LOV_MAGIC_V3: + return lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm)); + default: + CERROR("Cannot recognize lmm_magic %x", magic); + } + return; +} + +#define LMM_ASSERT(test) \ +do { \ + if (!(test)) lov_dump_lmm(D_ERROR, lmm); \ + LASSERT(test); /* so we know what assertion failed */ \ +} while(0) + +/* Pack LOV object metadata for disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + * + * XXX In the future, this will be enhanced to get the EA size from the + * underlying OSC device(s) to get their EA sizes so we can stack + * LOVs properly. For now lov_mds_md_size() just assumes one obd_id + * per stripe. + */ +int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + struct lov_mds_md_v1 *lmmv1; + struct lov_mds_md_v3 *lmmv3; + __u16 stripe_count; + struct lov_ost_data_v1 *lmm_objects; + int lmm_size, lmm_magic; + int i; + int cplen = 0; + ENTRY; + + if (lsm) { + lmm_magic = lsm->lsm_magic; + } else { + if (lmmp && *lmmp) + lmm_magic = le32_to_cpu((*lmmp)->lmm_magic); + else + /* lsm == NULL and lmmp == NULL */ + lmm_magic = LOV_MAGIC; + } + + if ((lmm_magic != LOV_MAGIC_V1) && + (lmm_magic != LOV_MAGIC_V3)) { + CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + RETURN(-EINVAL); + + } + + if (lsm) { + /* If we are just sizing the EA, limit the stripe count + * to the actual number of OSTs in this filesystem. */ + if (!lmmp) { + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lsm->lsm_stripe_count); + lsm->lsm_stripe_count = stripe_count; + } else { + stripe_count = lsm->lsm_stripe_count; + } + } else { + /* No need to allocate more than maximum supported stripes. + * Anyway, this is pretty inaccurate since ld_tgt_count now + * represents max index and we should rely on the actual number + * of OSTs instead */ + stripe_count = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize, + lmm_magic); + if (stripe_count > lov->desc.ld_tgt_count) + stripe_count = lov->desc.ld_tgt_count; + } + + /* XXX LOV STACKING call into osc for sizes */ + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + + if (!lmmp) + RETURN(lmm_size); + + if (*lmmp && !lsm) { + stripe_count = le16_to_cpu((*lmmp)->lmm_stripe_count); + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + OBD_FREE_LARGE(*lmmp, lmm_size); + *lmmp = NULL; + RETURN(0); + } + + if (!*lmmp) { + OBD_ALLOC_LARGE(*lmmp, lmm_size); + if (!*lmmp) + RETURN(-ENOMEM); + } + + CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n", + lmm_magic, lmm_size); + + lmmv1 = *lmmp; + lmmv3 = (struct lov_mds_md_v3 *)*lmmp; + if (lmm_magic == LOV_MAGIC_V3) + lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3); + else + lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); + + if (!lsm) + RETURN(lmm_size); + + /* lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); + lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le16(stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); + lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); + if (lsm->lsm_magic == LOV_MAGIC_V3) { + cplen = strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, + sizeof(lmmv3->lmm_pool_name)); + if (cplen >= sizeof(lmmv3->lmm_pool_name)) + RETURN(-E2BIG); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } + + for (i = 0; i < stripe_count; i++) { + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + /* XXX LOV STACKING call down to osc_packmd() to do packing */ + LASSERTF(ostid_id(&loi->loi_oi) != 0, "lmm_oi "DOSTID + " stripe %u/%u idx %u\n", POSTID(&lmmv1->lmm_oi), + i, stripe_count, loi->loi_ost_idx); + ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + } + + RETURN(lmm_size); +} + +/* Find the max stripecount we should use */ +__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count) +{ + __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; + + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + if (!stripe_count) + stripe_count = 1; + + /* stripe count is based on whether ldiskfs can handle + * larger EA sizes */ + if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && + lov->lov_ocd.ocd_max_easize) + max_stripes = lov_mds_md_stripecnt(lov->lov_ocd.ocd_max_easize, + magic); + + if (stripe_count > max_stripes) + stripe_count = max_stripes; + + return stripe_count; +} + + +static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count) +{ + int rc; + + if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) { + char *buffer; + int sz; + + CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n", + le32_to_cpu(*(__u32 *)lmm), lmm_bytes); + sz = lmm_bytes * 2 + 1; + OBD_ALLOC_LARGE(buffer, sz); + if (buffer != NULL) { + int i; + + for (i = 0; i < lmm_bytes; i++) + sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]); + buffer[sz - 1] = '\0'; + CERROR("%s\n", buffer); + OBD_FREE_LARGE(buffer, sz); + } + return -EINVAL; + } + rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, + lmm_bytes, stripe_count); + return rc; +} + +int lov_alloc_memmd(struct lov_stripe_md **lsmp, __u16 stripe_count, + int pattern, int magic) +{ + int i, lsm_size; + ENTRY; + + CDEBUG(D_INFO, "alloc lsm, stripe_count %d\n", stripe_count); + + *lsmp = lsm_alloc_plain(stripe_count, &lsm_size); + if (!*lsmp) { + CERROR("can't allocate lsmp stripe_count %d\n", stripe_count); + RETURN(-ENOMEM); + } + + atomic_set(&(*lsmp)->lsm_refc, 1); + spin_lock_init(&(*lsmp)->lsm_lock); + (*lsmp)->lsm_magic = magic; + (*lsmp)->lsm_stripe_count = stripe_count; + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; + (*lsmp)->lsm_pattern = pattern; + (*lsmp)->lsm_pool_name[0] = '\0'; + (*lsmp)->lsm_layout_gen = 0; + (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0; + + for (i = 0; i < stripe_count; i++) + loi_init((*lsmp)->lsm_oinfo[i]); + + RETURN(lsm_size); +} + +int lov_free_memmd(struct lov_stripe_md **lsmp) +{ + struct lov_stripe_md *lsm = *lsmp; + int refc; + + *lsmp = NULL; + LASSERT(atomic_read(&lsm->lsm_refc) > 0); + if ((refc = atomic_dec_return(&lsm->lsm_refc)) == 0) { + LASSERT(lsm_op_find(lsm->lsm_magic) != NULL); + lsm_op_find(lsm->lsm_magic)->lsm_free(lsm); + } + return refc; +} + + +/* Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ +int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int rc = 0, lsm_size; + __u16 stripe_count; + __u32 magic; + ENTRY; + + /* If passed an MDS struct use values from there, otherwise defaults */ + if (lmm) { + rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count); + if (rc) + RETURN(rc); + magic = le32_to_cpu(lmm->lmm_magic); + } else { + magic = LOV_MAGIC; + stripe_count = lov_get_stripecnt(lov, magic, 0); + } + + /* If we aren't passed an lsmp struct, we just want the size */ + if (!lsmp) { + /* XXX LOV STACKING call into osc for sizes */ + LBUG(); + RETURN(lov_stripe_md_size(stripe_count)); + } + /* If we are passed an allocated struct but nothing to unpack, free */ + if (*lsmp && !lmm) { + lov_free_memmd(lsmp); + RETURN(0); + } + + lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0, + magic); + if (lsm_size < 0) + RETURN(lsm_size); + + /* If we are passed a pointer but nothing to unpack, we only alloc */ + if (!lmm) + RETURN(lsm_size); + + LASSERT(lsm_op_find(magic) != NULL); + rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm); + if (rc) { + lov_free_memmd(lsmp); + RETURN(rc); + } + + RETURN(lsm_size); +} + +static int __lov_setstripe(struct obd_export *exp, int max_lmm_size, + struct lov_stripe_md **lsmp, + struct lov_user_md *lump) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + char buffer[sizeof(struct lov_user_md_v3)]; + struct lov_user_md_v3 *lumv3 = (struct lov_user_md_v3 *)&buffer[0]; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&buffer[0]; + int lmm_magic; + __u16 stripe_count; + int rc; + int cplen = 0; + ENTRY; + + rc = lov_lum_swab_if_needed(lumv3, &lmm_magic, lump); + if (rc) + RETURN(rc); + + /* in the rest of the tests, as *lumv1 and lumv3 have the same + * fields, we use lumv1 to avoid code duplication */ + + if (lumv1->lmm_pattern == 0) { + lumv1->lmm_pattern = lov->desc.ld_pattern ? + lov->desc.ld_pattern : LOV_PATTERN_RAID0; + } + + if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) { + CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n", + lumv1->lmm_pattern); + RETURN(-EINVAL); + } + + /* 64kB is the largest common page size we see (ia64), and matches the + * check in lfs */ + if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { + CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n", + lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE); + lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE; + } + + if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) && + (lumv1->lmm_stripe_offset != + (typeof(lumv1->lmm_stripe_offset))(-1))) { + CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n", + lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count); + RETURN(-EINVAL); + } + stripe_count = lov_get_stripecnt(lov, lmm_magic, + lumv1->lmm_stripe_count); + + if (max_lmm_size) { + int max_stripes = (max_lmm_size - + lov_mds_md_size(0, lmm_magic)) / + sizeof(struct lov_ost_data_v1); + if (unlikely(max_stripes < stripe_count)) { + CDEBUG(D_IOCTL, "stripe count reset from %d to %d\n", + stripe_count, max_stripes); + stripe_count = max_stripes; + } + } + + if (lmm_magic == LOV_USER_MAGIC_V3) { + struct pool_desc *pool; + + /* In the function below, .hs_keycmp resolves to + * pool_hashkey_keycmp() */ + /* coverity[overrun-buffer-val] */ + pool = lov_find_pool(lov, lumv3->lmm_pool_name); + if (pool != NULL) { + if (lumv3->lmm_stripe_offset != + (typeof(lumv3->lmm_stripe_offset))(-1)) { + rc = lov_check_index_in_pool( + lumv3->lmm_stripe_offset, pool); + if (rc < 0) { + lov_pool_putref(pool); + RETURN(-EINVAL); + } + } + + if (stripe_count > pool_tgt_count(pool)) + stripe_count = pool_tgt_count(pool); + + lov_pool_putref(pool); + } + } + + rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic); + + if (rc >= 0) { + (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset; + (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size; + if (lmm_magic == LOV_USER_MAGIC_V3) { + cplen = strlcpy((*lsmp)->lsm_pool_name, + lumv3->lmm_pool_name, + sizeof((*lsmp)->lsm_pool_name)); + if (cplen >= sizeof((*lsmp)->lsm_pool_name)) + rc = -E2BIG; + } + rc = 0; + } + + RETURN(rc); +} + +/* Configure object striping information on a new file. + * + * @lmmu is a pointer to a user struct with one or more of the fields set to + * indicate the application preference: lmm_stripe_count, lmm_stripe_size, + * lmm_stripe_offset, and lmm_stripe_pattern. lmm_magic must be LOV_MAGIC. + * @lsmp is a pointer to an in-core stripe MD that needs to be filled in. + */ +int lov_setstripe(struct obd_export *exp, int max_lmm_size, + struct lov_stripe_md **lsmp, struct lov_user_md *lump) +{ + int rc; + mm_segment_t seg; + + seg = get_fs(); + set_fs(KERNEL_DS); + + rc = __lov_setstripe(exp, max_lmm_size, lsmp, lump); + set_fs(seg); + RETURN(rc); +} + +int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_user_md *lump) +{ + int i; + int rc; + struct obd_export *oexp; + struct lov_obd *lov = &exp->exp_obd->u.lov; + obd_id last_id = 0; + struct lov_user_ost_data_v1 *lmm_objects; + + ENTRY; + + if (lump->lmm_magic == LOV_USER_MAGIC_V3) + lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects; + else + lmm_objects = lump->lmm_objects; + + for (i = 0; i < lump->lmm_stripe_count; i++) { + __u32 len = sizeof(last_id); + oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp; + rc = obd_get_info(NULL, oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID, + &len, &last_id, NULL); + if (rc) + RETURN(rc); + if (ostid_id(&lmm_objects[i].l_ost_oi) > last_id) { + CERROR("Setting EA for object > than last id on" + " ost idx %d "DOSTID" > "LPD64" \n", + lmm_objects[i].l_ost_idx, + POSTID(&lmm_objects[i].l_ost_oi), last_id); + RETURN(-EINVAL); + } + } + + rc = lov_setstripe(exp, 0, lsmp, lump); + if (rc) + RETURN(rc); + + for (i = 0; i < lump->lmm_stripe_count; i++) { + (*lsmp)->lsm_oinfo[i]->loi_ost_idx = + lmm_objects[i].l_ost_idx; + (*lsmp)->lsm_oinfo[i]->loi_oi = lmm_objects[i].l_ost_oi; + } + RETURN(0); +} + + +/* Retrieve object striping information. + * + * @lump is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_USER_MAGIC. + */ +int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_user_md *lump) +{ + /* + * XXX huge struct allocated on stack. + */ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum; + struct lov_mds_md *lmmk = NULL; + int rc, lmm_size; + int lum_size; + mm_segment_t seg; + ENTRY; + + if (!lsm) + RETURN(-ENODATA); + + /* + * "Switch to kernel segment" to allow copying from kernel space by + * copy_{to,from}_user(). + */ + seg = get_fs(); + set_fs(KERNEL_DS); + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) + GOTO(out_set, rc = -EFAULT); + else if ((lum.lmm_magic != LOV_USER_MAGIC) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) + GOTO(out_set, rc = -EINVAL); + + if (lum.lmm_stripe_count && + (lum.lmm_stripe_count < lsm->lsm_stripe_count)) { + /* Return right size of stripe to user */ + lum.lmm_stripe_count = lsm->lsm_stripe_count; + rc = copy_to_user(lump, &lum, lum_size); + GOTO(out_set, rc = -EOVERFLOW); + } + rc = lov_packmd(exp, &lmmk, lsm); + if (rc < 0) + GOTO(out_set, rc); + lmm_size = rc; + rc = 0; + + /* FIXME: Bug 1185 - copy fields properly when structs change */ + /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */ + CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3)); + CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]); + + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + ((lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) || + (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)))) { + lustre_swab_lov_mds_md(lmmk); + lustre_swab_lov_user_md_objects( + (struct lov_user_ost_data*)lmmk->lmm_objects, + lmmk->lmm_stripe_count); + } + if (lum.lmm_magic == LOV_USER_MAGIC) { + /* User request for v1, we need skip lmm_pool_name */ + if (lmmk->lmm_magic == LOV_MAGIC_V3) { + memmove((char*)(&lmmk->lmm_stripe_count) + + sizeof(lmmk->lmm_stripe_count), + ((struct lov_mds_md_v3*)lmmk)->lmm_objects, + lmmk->lmm_stripe_count * + sizeof(struct lov_ost_data_v1)); + lmm_size -= LOV_MAXPOOLNAME; + } + } else { + /* if v3 we just have to update the lum_size */ + lum_size = sizeof(struct lov_user_md_v3); + } + + /* User wasn't expecting this many OST entries */ + if (lum.lmm_stripe_count == 0) + lmm_size = lum_size; + else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) + GOTO(out_set, rc = -EOVERFLOW); + /* + * Have a difference between lov_mds_md & lov_user_md. + * So we have to re-order the data before copy to user. + */ + lum.lmm_stripe_count = lmmk->lmm_stripe_count; + lum.lmm_layout_gen = lmmk->lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen; + ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count; + if (copy_to_user(lump, lmmk, lmm_size)) + rc = -EFAULT; + + obd_free_diskmd(exp, &lmmk); +out_set: + set_fs(seg); + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c new file mode 100644 index 000000000000..65790d684720 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_page.c @@ -0,0 +1,235 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lov page operations. + * + */ + +static int lov_page_invariant(const struct cl_page_slice *slice) +{ + const struct cl_page *page = slice->cpl_page; + const struct cl_page *sub = lov_sub_page(slice); + + return ergo(sub != NULL, + page->cp_child == sub && + sub->cp_parent == page && + page->cp_state == sub->cp_state); +} + +static void lov_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct cl_page *sub = lov_sub_page(slice); + + LINVRNT(lov_page_invariant(slice)); + ENTRY; + + if (sub != NULL) { + LASSERT(sub->cp_state == CPS_FREEING); + lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent); + sub->cp_parent = NULL; + slice->cpl_page->cp_child = NULL; + cl_page_put(env, sub); + } + EXIT; +} + +static int lov_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io, + int nonblock) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + ENTRY; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + lov_sub_page(slice)->cp_owner = sub->sub_io; + lov_sub_put(sub); + } else + LBUG(); /* Arrgh */ + RETURN(0); +} + +static void lov_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + lov_page_own(env, slice, io, 0); +} + +static int lov_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + int rc = 0; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + ENTRY; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + rc = cl_page_cache_add(sub->sub_env, sub->sub_io, + slice->cpl_page->cp_child, CRT_WRITE); + lov_sub_put(sub); + } else { + rc = PTR_ERR(sub); + CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, "rc = %d\n", rc); + } + RETURN(rc); +} + +static int lov_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_page_ops = { + .cpo_fini = lov_page_fini, + .cpo_own = lov_page_own, + .cpo_assume = lov_page_assume, + .io = { + [CRT_WRITE] = { + .cpo_cache_add = lov_page_cache_add + } + }, + .cpo_print = lov_page_print +}; + +static void lov_empty_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + LASSERT(slice->cpl_page->cp_child == NULL); +} + +int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_object *loo = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_io *lio = lov_env_io(env); + struct cl_page *subpage; + struct cl_object *subobj; + struct lov_io_sub *sub; + struct lov_page *lpg = cl_object_page_slice(obj, page); + loff_t offset; + obd_off suboff; + int stripe; + int rc; + ENTRY; + + offset = cl_offset(obj, page->cp_index); + stripe = lov_stripe_number(loo->lo_lsm, offset); + LASSERT(stripe < r0->lo_nr); + rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, + &suboff); + LASSERT(rc == 0); + + lpg->lps_invalid = 1; + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops); + + sub = lov_sub_get(env, lio, stripe); + if (IS_ERR(sub)) + GOTO(out, rc = PTR_ERR(sub)); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + subpage = cl_page_find_sub(sub->sub_env, subobj, + cl_index(subobj, suboff), vmpage, page); + lov_sub_put(sub); + if (IS_ERR(subpage)) + GOTO(out, rc = PTR_ERR(subpage)); + + if (likely(subpage->cp_parent == page)) { + lu_ref_add(&subpage->cp_reference, "lov", page); + lpg->lps_invalid = 0; + rc = 0; + } else { + CL_PAGE_DEBUG(D_ERROR, env, page, "parent page\n"); + CL_PAGE_DEBUG(D_ERROR, env, subpage, "child page\n"); + LASSERT(0); + } + + EXIT; +out: + return rc; +} + + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_fini = lov_empty_page_fini, + .cpo_print = lov_page_print +}; + +int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct lov_page *lpg = cl_object_page_slice(obj, page); + void *addr; + ENTRY; + + cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_empty_page_ops); + addr = kmap(vmpage); + memset(addr, 0, cl_page_size(obj)); + kunmap(vmpage); + cl_page_export(env, page, 1); + RETURN(0); +} + + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c new file mode 100644 index 000000000000..0f3f96dee915 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_pool.c @@ -0,0 +1,682 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include "lov_internal.h" + +#define pool_tgt(_p, _i) \ + _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] + +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + atomic_inc(&pool->pool_refcount); +} + +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); + lov_ost_pool_free(&(pool->pool_rr.lqr_pool)); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE_PTR(pool); + EXIT; + } +} + +void lov_pool_putref_locked(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); + LASSERT(atomic_read(&pool->pool_refcount) > 1); + + atomic_dec(&pool->pool_refcount); +} + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. + * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < LOV_MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool->pool_name); +} + +static int pool_hashkey_keycmp(const void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + return !strncmp(pool_name, pool->pool_name, LOV_MAXPOOLNAME); +} + +static void *pool_hashobject(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct pool_desc, pool_hash); +} + +static void pool_hashrefcount_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_getref(pool); +} + +static void pool_hashrefcount_put_locked(cfs_hash_t *hs, + struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + lov_pool_putref_locked(pool); +} + +cfs_hash_ops_t pool_hash_operations = { + .hs_hash = pool_hashfn, + .hs_key = pool_key, + .hs_keycmp = pool_hashkey_keycmp, + .hs_object = pool_hashobject, + .hs_get = pool_hashrefcount_get, + .hs_put_locked = pool_hashrefcount_put_locked, + +}; + +#ifdef LPROCFS +/* ifdef needed for liblustre support */ +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + down_read(&pool_tgt_rw_sem(iter->pool)); + iter->idx++; + if (iter->idx == pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + up_read(&pool_tgt_rw_sem(iter->pool)); + return NULL; + } + up_read(&pool_tgt_rw_sem(iter->pool)); + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + lov_pool_getref(pool); + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) { + /* iter is not created, so stop() has no way to + * find pool to dec ref */ + lov_pool_putref(pool); + return NULL; + } + + OBD_ALLOC_PTR(iter); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + lov_pool_putref(iter->pool); + OBD_FREE_PTR(iter); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + down_read(&pool_tgt_rw_sem(iter->pool)); + tgt = pool_tgt(iter->pool, iter->idx); + up_read(&pool_tgt_rw_sem(iter->pool)); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PROC_I(inode)->pde->data; + } + return rc; +} + +static struct file_operations pool_proc_operations = { + .open = pool_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* LPROCFS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + lov_pool_getref(pool); + + CDEBUG(level, "pool "LOV_POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "LOV_POOLNAMEF"[%d] = %s\n", + pool->pool_name, i, + obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + + up_read(&pool_tgt_rw_sem(pool)); + lov_pool_putref(pool); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +{ + ENTRY; + + if (count == 0) + count = LOV_POOL_INIT_COUNT; + op->op_array = NULL; + op->op_count = 0; + init_rwsem(&op->op_rw_sem); + op->op_size = count; + OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0])); + if (op->op_array == NULL) { + op->op_size = 0; + RETURN(-ENOMEM); + } + EXIT; + return 0; +} + +/* Caller must hold write op_rwlock */ +int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +{ + __u32 *new; + int new_size; + + LASSERT(min_count != 0); + + if (op->op_count < op->op_size) + return 0; + + new_size = max(min_count, 2 * op->op_size); + OBD_ALLOC(new, new_size * sizeof(op->op_array[0])); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0])); + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = new; + op->op_size = new_size; + return 0; +} + +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +{ + int rc = 0, i; + ENTRY; + + down_write(&op->op_rw_sem); + + rc = lov_ost_pool_extend(op, min_count); + if (rc) + GOTO(out, rc); + + /* search ost in pool array */ + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) + GOTO(out, rc = -EEXIST); + } + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; + EXIT; +out: + up_write(&op->op_rw_sem); + return rc; +} + +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +{ + int i; + ENTRY; + + down_write(&op->op_rw_sem); + + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + up_write(&op->op_rw_sem); + EXIT; + return 0; + } + } + + up_write(&op->op_rw_sem); + RETURN(-EINVAL); +} + +int lov_ost_pool_free(struct ost_pool *op) +{ + ENTRY; + + if (op->op_size == 0) + RETURN(0); + + down_write(&op->op_rw_sem); + + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + + up_write(&op->op_rw_sem); + RETURN(0); +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + if (strlen(poolname) > LOV_MAXPOOLNAME) + RETURN(-ENAMETOOLONG); + + OBD_ALLOC_PTR(new_pool); + if (new_pool == NULL) + RETURN(-ENOMEM); + + strncpy(new_pool->pool_name, poolname, LOV_MAXPOOLNAME); + new_pool->pool_name[LOV_MAXPOOLNAME] = '\0'; + new_pool->pool_lobd = obd; + /* ref count init to 1 because when created a pool is always used + * up to deletion + */ + atomic_set(&new_pool->pool_refcount, 1); + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + GOTO(out_err, rc); + + memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr)); + rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); + if (rc) + GOTO(out_free_pool_obds, rc); + + INIT_HLIST_NODE(&new_pool->pool_hash); + +#ifdef LPROCFS + /* we need this assert seq_file is not implementated for liblustre */ + /* get ref for /proc file */ + lov_pool_getref(new_pool); + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, NULL, NULL, + new_pool, + &pool_proc_operations); + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "LOV_POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + lov_pool_putref(new_pool); + } + CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = cfs_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) + GOTO(out_err, rc = -EEXIST); + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + + RETURN(0); + +out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + lprocfs_remove(&new_pool->pool_proc_entry); + + lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); +out_free_pool_obds: + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE_PTR(new_pool); + return rc; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + ENTRY; + + lov = &(obd->u.lov); + + /* lookup and kill hash reference */ + pool = cfs_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + if (pool->pool_proc_entry != NULL) { + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); + lov_pool_putref(pool); + } + + spin_lock(&obd->obd_dev_lock); + list_del_init(&pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + /* release last reference */ + lov_pool_putref(pool); + + RETURN(0); +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc; + ENTRY; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + + /* search ost in lov array */ + obd_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + GOTO(out, rc); + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "Added %s to "LOV_POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + + EXIT; +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int lov_idx; + int rc = 0; + ENTRY; + + lov = &(obd->u.lov); + + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + RETURN(-ENOENT); + + obd_str2uuid(&ost_uuid, ostname); + + obd_getref(obd); + /* search ost in lov array, to get index */ + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) + continue; + + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (lov_idx == lov->desc.ld_tgt_count) + GOTO(out, rc = -EINVAL); + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, + poolname); + + EXIT; +out: + obd_putref(obd); + lov_pool_putref(pool); + return rc; +} + +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool) +{ + int i, rc; + ENTRY; + + /* caller may no have a ref on pool if it got the pool + * without calling lov_find_pool() (e.g. go through the lov pool + * list) + */ + lov_pool_getref(pool); + + down_read(&pool_tgt_rw_sem(pool)); + + for (i = 0; i < pool_tgt_count(pool); i++) { + if (pool_tgt_array(pool)[i] == idx) + GOTO(out, rc = 0); + } + rc = -ENOENT; + EXIT; +out: + up_read(&pool_tgt_rw_sem(pool)); + + lov_pool_putref(pool); + return rc; +} + +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname) +{ + struct pool_desc *pool; + + pool = NULL; + if (poolname[0] != '\0') { + pool = cfs_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + CWARN("Request for an unknown pool ("LOV_POOLNAMEF")\n", + poolname); + if ((pool != NULL) && (pool_tgt_count(pool) == 0)) { + CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n", + poolname); + /* pool is ignored, so we remove ref on it */ + lov_pool_putref(pool); + pool = NULL; + } + } + return pool; +} diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c new file mode 100644 index 000000000000..13f1637bc700 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lov_request.c @@ -0,0 +1,1551 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include + +#include +#include +#include + +#include "lov_internal.h" + +static void lov_init_set(struct lov_request_set *set) +{ + set->set_count = 0; + atomic_set(&set->set_completes, 0); + atomic_set(&set->set_success, 0); + atomic_set(&set->set_finish_checked, 0); + set->set_cookies = 0; + INIT_LIST_HEAD(&set->set_list); + atomic_set(&set->set_refcount, 1); + init_waitqueue_head(&set->set_waitq); + spin_lock_init(&set->set_lock); +} + +void lov_finish_set(struct lov_request_set *set) +{ + struct list_head *pos, *n; + ENTRY; + + LASSERT(set); + list_for_each_safe(pos, n, &set->set_list) { + struct lov_request *req = list_entry(pos, + struct lov_request, + rq_link); + list_del_init(&req->rq_link); + + if (req->rq_oi.oi_oa) + OBDO_FREE(req->rq_oi.oi_oa); + if (req->rq_oi.oi_md) + OBD_FREE_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_osfs) + OBD_FREE(req->rq_oi.oi_osfs, + sizeof(*req->rq_oi.oi_osfs)); + OBD_FREE(req, sizeof(*req)); + } + + if (set->set_pga) { + int len = set->set_oabufs * sizeof(*set->set_pga); + OBD_FREE_LARGE(set->set_pga, len); + } + if (set->set_lockh) + lov_llh_put(set->set_lockh); + + OBD_FREE(set, sizeof(*set)); + EXIT; +} + +int lov_set_finished(struct lov_request_set *set, int idempotent) +{ + int completes = atomic_read(&set->set_completes); + + CDEBUG(D_INFO, "check set %d/%d\n", completes, set->set_count); + + if (completes == set->set_count) { + if (idempotent) + return 1; + if (atomic_inc_return(&set->set_finish_checked) == 1) + return 1; + } + return 0; +} + +void lov_update_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + req->rq_complete = 1; + req->rq_rc = rc; + + atomic_inc(&set->set_completes); + if (rc == 0) + atomic_inc(&set->set_success); + + wake_up(&set->set_waitq); +} + +int lov_update_common_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; + ENTRY; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + /* FIXME in raid1 regime, should return 0 */ + RETURN(rc); +} + +void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) +{ + list_add_tail(&req->rq_link, &set->set_list); + set->set_count++; + req->rq_rqset = set; +} + +static int lov_check_set(struct lov_obd *lov, int idx) +{ + int rc = 0; + mutex_lock(&lov->lov_lock); + + if (lov->lov_tgts[idx] == NULL || + lov->lov_tgts[idx]->ltd_active || + (lov->lov_tgts[idx]->ltd_exp != NULL && + class_exp2cliimp(lov->lov_tgts[idx]->ltd_exp)->imp_connect_tried)) + rc = 1; + + mutex_unlock(&lov->lov_lock); + return rc; +} + +/* Check if the OSC connection exists and is active. + * If the OSC has not yet had a chance to connect to the OST the first time, + * wait once for it to connect instead of returning an error. + */ +int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + struct lov_tgt_desc *tgt; + int rc = 0; + + mutex_lock(&lov->lov_lock); + + tgt = lov->lov_tgts[ost_idx]; + + if (unlikely(tgt == NULL)) + GOTO(out, rc = 0); + + if (likely(tgt->ltd_active)) + GOTO(out, rc = 1); + + if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) + GOTO(out, rc = 0); + + mutex_unlock(&lov->lov_lock); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(obd_timeout), + cfs_time_seconds(1), NULL, NULL); + + rc = l_wait_event(waitq, lov_check_set(lov, ost_idx), &lwi); + if (tgt != NULL && tgt->ltd_active) + return 1; + + return 0; + +out: + mutex_unlock(&lov->lov_lock); + return rc; +} + +extern void osc_update_enqueue(struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, + struct ost_lvb *lvb, __u32 mode, int rc); + +static int lov_update_enqueue_lov(struct obd_export *exp, + struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, int idx, + struct ost_id *oi, int rc) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + + if (rc != ELDLM_OK && + !(rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))) { + memset(lov_lockhp, 0, sizeof(*lov_lockhp)); + if (lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active) { + /* -EUSERS used by OST to report file contention */ + if (rc != -EINTR && rc != -EUSERS) + CERROR("%s: enqueue objid "DOSTID" subobj" + DOSTID" on OST idx %d: rc %d\n", + exp->exp_obd->obd_name, + POSTID(oi), POSTID(&loi->loi_oi), + loi->loi_ost_idx, rc); + } else + rc = ELDLM_OK; + } + return rc; +} + +int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc) +{ + struct lov_request_set *set = req->rq_rqset; + struct lustre_handle *lov_lockhp; + struct obd_info *oi = set->set_oi; + struct lov_oinfo *loi; + ENTRY; + + LASSERT(oi != NULL); + + lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; + loi = oi->oi_md->lsm_oinfo[req->rq_stripe]; + + /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set + * and that copy can be arbitrarily out of date. + * + * The LOV API is due for a serious rewriting anyways, and this + * can be addressed then. */ + + lov_stripe_lock(oi->oi_md); + osc_update_enqueue(lov_lockhp, loi, oi->oi_flags, + &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc); + if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT)) + memset(lov_lockhp, 0, sizeof *lov_lockhp); + rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags, + req->rq_idx, &oi->oi_md->lsm_oi, rc); + lov_stripe_unlock(oi->oi_md); + lov_update_set(set, req, rc); + RETURN(rc); +} + +/* The callback for osc_enqueue that updates lov info for every OSC request. */ +static int cb_update_enqueue(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct ldlm_enqueue_info *einfo; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + einfo = lovreq->rq_rqset->set_ei; + return lov_update_enqueue_set(lovreq, einfo->ei_mode, rc); +} + +static int enqueue_done(struct lov_request_set *set, __u32 mode) +{ + struct lov_request *req; + struct lov_obd *lov = &set->set_exp->exp_obd->u.lov; + int completes = atomic_read(&set->set_completes); + int rc = 0; + ENTRY; + + /* enqueue/match success, just return */ + if (completes && completes == atomic_read(&set->set_success)) + RETURN(0); + + /* cancel enqueued/matched locks */ + list_for_each_entry(req, &set->set_list, rq_link) { + struct lustre_handle *lov_lockhp; + + if (!req->rq_complete || req->rq_rc) + continue; + + lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; + LASSERT(lov_lockhp); + if (!lustre_handle_is_used(lov_lockhp)) + continue; + + rc = obd_cancel(lov->lov_tgts[req->rq_idx]->ltd_exp, + req->rq_oi.oi_md, mode, lov_lockhp); + if (rc && lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active) + CERROR("%s: cancelling obdjid "DOSTID" on OST" + "idx %d error: rc = %d\n", + set->set_exp->exp_obd->obd_name, + POSTID(&req->rq_oi.oi_md->lsm_oi), + req->rq_idx, rc); + } + if (set->set_lockh) + lov_llh_put(set->set_lockh); + RETURN(rc); +} + +int lov_fini_enqueue_set(struct lov_request_set *set, __u32 mode, int rc, + struct ptlrpc_request_set *rqset) +{ + int ret = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + /* Do enqueue_done only for sync requests and if any request + * succeeded. */ + if (!rqset) { + if (rc) + atomic_set(&set->set_completes, 0); + ret = enqueue_done(set, mode); + } else if (set->set_lockh) + lov_llh_put(set->set_lockh); + + lov_put_reqset(set); + + RETURN(rc ? rc : ret); +} + +static void lov_llh_addref(void *llhp) +{ + struct lov_lock_handles *llh = llhp; + + atomic_inc(&llh->llh_refcount); + CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount)); +} + +static struct portals_handle_ops lov_handle_ops = { + .hop_addref = lov_llh_addref, + .hop_free = NULL, +}; + +static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm) +{ + struct lov_lock_handles *llh; + + OBD_ALLOC(llh, sizeof *llh + + sizeof(*llh->llh_handles) * lsm->lsm_stripe_count); + if (llh == NULL) + return NULL; + + atomic_set(&llh->llh_refcount, 2); + llh->llh_stripe_count = lsm->lsm_stripe_count; + INIT_LIST_HEAD(&llh->llh_handle.h_link); + class_handle_hash(&llh->llh_handle, &lov_handle_ops); + + return llh; +} + +int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct lov_request_set **reqset) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + struct lov_request_set *set; + int i, rc = 0; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_ei = einfo; + set->set_lockh = lov_llh_new(oinfo->oi_md); + if (set->set_lockh == NULL) + GOTO(out_set, rc = -ENOMEM); + oinfo->oi_lockh->cookie = set->set_lockh->llh_handle.h_cookie; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + obd_off start, end; + + loi = oinfo->oi_md->lsm_oinfo[i]; + if (!lov_stripe_intersects(oinfo->oi_md, i, + oinfo->oi_policy.l_extent.start, + oinfo->oi_policy.l_extent.end, + &start, &end)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + req->rq_buflen = sizeof(*req->rq_oi.oi_md) + + sizeof(struct lov_oinfo *) + + sizeof(struct lov_oinfo); + OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + req->rq_oi.oi_md->lsm_oinfo[0] = + ((void *)req->rq_oi.oi_md) + sizeof(*req->rq_oi.oi_md) + + sizeof(struct lov_oinfo *); + + /* Set lov request specific parameters. */ + req->rq_oi.oi_lockh = set->set_lockh->llh_handles + i; + req->rq_oi.oi_cb_up = cb_update_enqueue; + req->rq_oi.oi_flags = oinfo->oi_flags; + + LASSERT(req->rq_oi.oi_lockh); + + req->rq_oi.oi_policy.l_extent.gid = + oinfo->oi_policy.l_extent.gid; + req->rq_oi.oi_policy.l_extent.start = start; + req->rq_oi.oi_policy.l_extent.end = end; + + req->rq_idx = loi->loi_ost_idx; + req->rq_stripe = i; + + /* XXX LOV STACKING: submd should be from the subobj */ + req->rq_oi.oi_md->lsm_oi = loi->loi_oi; + req->rq_oi.oi_md->lsm_stripe_count = 0; + req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid = + loi->loi_kms_valid; + req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms = loi->loi_kms; + req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb = loi->loi_lvb; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(0); +out_set: + lov_fini_enqueue_set(set, einfo->ei_mode, rc, NULL); + RETURN(rc); +} + +int lov_fini_match_set(struct lov_request_set *set, __u32 mode, int flags) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + rc = enqueue_done(set, mode); + if ((set->set_count == atomic_read(&set->set_success)) && + (flags & LDLM_FL_TEST_LOCK)) + lov_llh_put(set->set_lockh); + + lov_put_reqset(set); + + RETURN(rc); +} + +int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, ldlm_policy_data_t *policy, + __u32 mode, struct lustre_handle *lockh, + struct lov_request_set **reqset) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + struct lov_request_set *set; + int i, rc = 0; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_lockh = lov_llh_new(lsm); + if (set->set_lockh == NULL) + GOTO(out_set, rc = -ENOMEM); + lockh->cookie = set->set_lockh->llh_handle.h_cookie; + + for (i = 0; i < lsm->lsm_stripe_count; i++){ + struct lov_oinfo *loi; + struct lov_request *req; + obd_off start, end; + + loi = lsm->lsm_oinfo[i]; + if (!lov_stripe_intersects(lsm, i, policy->l_extent.start, + policy->l_extent.end, &start, &end)) + continue; + + /* FIXME raid1 should grace this error */ + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + GOTO(out_set, rc = -EIO); + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_oi.oi_policy.l_extent.start = start; + req->rq_oi.oi_policy.l_extent.end = end; + req->rq_oi.oi_policy.l_extent.gid = policy->l_extent.gid; + + req->rq_idx = loi->loi_ost_idx; + req->rq_stripe = i; + + /* XXX LOV STACKING: submd should be from the subobj */ + req->rq_oi.oi_md->lsm_oi = loi->loi_oi; + req->rq_oi.oi_md->lsm_stripe_count = 0; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_match_set(set, mode, 0); + RETURN(rc); +} + +int lov_fini_cancel_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + + LASSERT(set->set_exp); + if (set->set_lockh) + lov_llh_put(set->set_lockh); + + lov_put_reqset(set); + + RETURN(rc); +} + +int lov_prep_cancel_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md *lsm, __u32 mode, + struct lustre_handle *lockh, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + int i, rc = 0; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_lockh = lov_handle2llh(lockh); + if (set->set_lockh == NULL) { + CERROR("LOV: invalid lov lock handle %p\n", lockh); + GOTO(out_set, rc = -EINVAL); + } + lockh->cookie = set->set_lockh->llh_handle.h_cookie; + + for (i = 0; i < lsm->lsm_stripe_count; i++){ + struct lov_request *req; + struct lustre_handle *lov_lockhp; + struct lov_oinfo *loi = lsm->lsm_oinfo[i]; + + lov_lockhp = set->set_lockh->llh_handles + i; + if (!lustre_handle_is_used(lov_lockhp)) { + CDEBUG(D_INFO, "lov idx %d subobj "DOSTID" no lock\n", + loi->loi_ost_idx, POSTID(&loi->loi_oi)); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = loi->loi_ost_idx; + req->rq_stripe = i; + + /* XXX LOV STACKING: submd should be from the subobj */ + req->rq_oi.oi_md->lsm_oi = loi->loi_oi; + req->rq_oi.oi_md->lsm_stripe_count = 0; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_cancel_set(set); + RETURN(rc); +} +static int common_attr_done(struct lov_request_set *set) +{ + struct list_head *pos; + struct lov_request *req; + struct obdo *tmp_oa; + int rc = 0, attrset = 0; + ENTRY; + + LASSERT(set->set_oi != NULL); + + if (set->set_oi->oi_oa == NULL) + RETURN(0); + + if (!atomic_read(&set->set_success)) + RETURN(-EIO); + + OBDO_ALLOC(tmp_oa); + if (tmp_oa == NULL) + GOTO(out, rc = -ENOMEM); + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (!req->rq_complete || req->rq_rc) + continue; + if (req->rq_oi.oi_oa->o_valid == 0) /* inactive stripe */ + continue; + lov_merge_attrs(tmp_oa, req->rq_oi.oi_oa, + req->rq_oi.oi_oa->o_valid, + set->set_oi->oi_md, req->rq_stripe, &attrset); + } + if (!attrset) { + CERROR("No stripes had valid attrs\n"); + rc = -EIO; + } + if ((set->set_oi->oi_oa->o_valid & OBD_MD_FLEPOCH) && + (set->set_oi->oi_md->lsm_stripe_count != attrset)) { + /* When we take attributes of some epoch, we require all the + * ost to be active. */ + CERROR("Not all the stripes had valid attrs\n"); + GOTO(out, rc = -EIO); + } + + tmp_oa->o_oi = set->set_oi->oi_oa->o_oi; + memcpy(set->set_oi->oi_oa, tmp_oa, sizeof(*set->set_oi->oi_oa)); +out: + if (tmp_oa) + OBDO_FREE(tmp_oa); + RETURN(rc); + +} + +static int brw_done(struct lov_request_set *set) +{ + struct lov_stripe_md *lsm = set->set_oi->oi_md; + struct lov_oinfo *loi = NULL; + struct list_head *pos; + struct lov_request *req; + ENTRY; + + list_for_each (pos, &set->set_list) { + req = list_entry(pos, struct lov_request, rq_link); + + if (!req->rq_complete || req->rq_rc) + continue; + + loi = lsm->lsm_oinfo[req->rq_stripe]; + + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) + loi->loi_lvb.lvb_blocks = req->rq_oi.oi_oa->o_blocks; + } + + RETURN(0); +} + +int lov_fini_brw_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + rc = brw_done(set); + /* FIXME update qos data here */ + } + lov_put_reqset(set); + + RETURN(rc); +} + +int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pga, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct { + obd_count index; + obd_count count; + obd_count off; + } *info = NULL; + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i, shift; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oti = oti; + set->set_oi = oinfo; + set->set_oabufs = oa_bufs; + OBD_ALLOC_LARGE(set->set_pga, oa_bufs * sizeof(*set->set_pga)); + if (!set->set_pga) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC_LARGE(info, sizeof(*info) * oinfo->oi_md->lsm_stripe_count); + if (!info) + GOTO(out, rc = -ENOMEM); + + /* calculate the page count for each stripe */ + for (i = 0; i < oa_bufs; i++) { + int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off); + info[stripe].count++; + } + + /* alloc and initialize lov request */ + shift = 0; + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++){ + struct lov_oinfo *loi = NULL; + struct lov_request *req; + + if (info[i].count == 0) + continue; + + loi = oinfo->oi_md->lsm_oinfo[i]; + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + GOTO(out, rc = -EIO); + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out, rc = -ENOMEM); + } + + if (oinfo->oi_oa) { + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + } + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_stripe_idx = i; + + req->rq_buflen = sizeof(*req->rq_oi.oi_md); + OBD_ALLOC_LARGE(req->rq_oi.oi_md, req->rq_buflen); + if (req->rq_oi.oi_md == NULL) { + OBDO_FREE(req->rq_oi.oi_oa); + OBD_FREE(req, sizeof(*req)); + GOTO(out, rc = -ENOMEM); + } + + req->rq_idx = loi->loi_ost_idx; + req->rq_stripe = i; + + /* XXX LOV STACKING */ + req->rq_oi.oi_md->lsm_oi = loi->loi_oi; + req->rq_oabufs = info[i].count; + req->rq_pgaidx = shift; + shift += req->rq_oabufs; + + /* remember the index for sort brw_page array */ + info[i].index = req->rq_pgaidx; + + req->rq_oi.oi_capa = oinfo->oi_capa; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out, rc = -EIO); + + /* rotate & sort the brw_page array */ + for (i = 0; i < oa_bufs; i++) { + int stripe = lov_stripe_number(oinfo->oi_md, pga[i].off); + + shift = info[stripe].index + info[stripe].off; + LASSERT(shift < oa_bufs); + set->set_pga[shift] = pga[i]; + lov_stripe_offset(oinfo->oi_md, pga[i].off, stripe, + &set->set_pga[shift].off); + info[stripe].off++; + } +out: + if (info) + OBD_FREE_LARGE(info, + sizeof(*info) * oinfo->oi_md->lsm_stripe_count); + + if (rc == 0) + *reqset = set; + else + lov_fini_brw_set(set); + + RETURN(rc); +} + +int lov_fini_getattr_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) + rc = common_attr_done(set); + + lov_put_reqset(set); + + RETURN(rc); +} + +/* The callback for osc_getattr_async that finilizes a request info when a + * response is received. */ +static int cb_getattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_getattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = oinfo->oi_md->lsm_oinfo[i]; + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + if (oinfo->oi_oa->o_valid & OBD_MD_FLEPOCH) + /* SOM requires all the OSTs to be active. */ + GOTO(out_set, rc = -EIO); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_cb_up = cb_getattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_getattr_set(set); + RETURN(rc); +} + +int lov_fini_destroy_set(struct lov_request_set *set) +{ + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + + RETURN(0); +} + +int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, + struct obdo *src_oa, struct lov_stripe_md *lsm, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + set->set_oi->oi_md = lsm; + set->set_oi->oi_oa = src_oa; + set->set_oti = oti; + if (oti != NULL && src_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + struct lov_oinfo *loi; + struct lov_request *req; + + loi = lsm->lsm_oinfo[i]; + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_destroy_set(set); + RETURN(rc); +} + +int lov_fini_setattr_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + rc = common_attr_done(set); + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + RETURN(rc); +} + +int lov_update_setattr_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov; + struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md; + ENTRY; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !(lov->lov_tgts[req->rq_idx] && + lov->lov_tgts[req->rq_idx]->ltd_active)) + rc = 0; + + if (rc == 0) { + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLCTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_ctime = + req->rq_oi.oi_oa->o_ctime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLMTIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_mtime = + req->rq_oi.oi_oa->o_mtime; + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLATIME) + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_atime = + req->rq_oi.oi_oa->o_atime; + } + + RETURN(rc); +} + +/* The callback for osc_setattr_async that finilizes a request info when a + * response is received. */ +static int cb_setattr_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oti = oti; + set->set_oi = oinfo; + if (oti != NULL && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + set->set_cookies = oti->oti_logcookies; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i]; + struct lov_request *req; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_setattr_update; + req->rq_oi.oi_capa = oinfo->oi_capa; + + if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE) { + int off = lov_stripe_offset(oinfo->oi_md, + oinfo->oi_oa->o_size, i, + &req->rq_oi.oi_oa->o_size); + + if (off < 0 && req->rq_oi.oi_oa->o_size) + req->rq_oi.oi_oa->o_size--; + + CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n", + i, req->rq_oi.oi_oa->o_size, + oinfo->oi_oa->o_size); + } + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_setattr_set(set); + RETURN(rc); +} + +int lov_fini_punch_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + rc = -EIO; + /* FIXME update qos data here */ + if (atomic_read(&set->set_success)) + rc = common_attr_done(set); + } + + lov_put_reqset(set); + + RETURN(rc); +} + +int lov_update_punch_set(struct lov_request_set *set, + struct lov_request *req, int rc) +{ + struct lov_obd *lov = &req->rq_rqset->set_exp->exp_obd->u.lov; + struct lov_stripe_md *lsm = req->rq_rqset->set_oi->oi_md; + ENTRY; + + lov_update_set(set, req, rc); + + /* grace error on inactive ost */ + if (rc && !lov->lov_tgts[req->rq_idx]->ltd_active) + rc = 0; + + if (rc == 0) { + lov_stripe_lock(lsm); + if (req->rq_oi.oi_oa->o_valid & OBD_MD_FLBLOCKS) { + lsm->lsm_oinfo[req->rq_stripe]->loi_lvb.lvb_blocks = + req->rq_oi.oi_oa->o_blocks; + } + + lov_stripe_unlock(lsm); + } + + RETURN(rc); +} + +/* The callback for osc_punch that finilizes a request info when a response + * is received. */ +static int cb_update_punch(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_punch_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_punch_set(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_oi = oinfo; + set->set_exp = exp; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i]; + struct lov_request *req; + obd_off rs, re; + + if (!lov_stripe_intersects(oinfo->oi_md, i, + oinfo->oi_policy.l_extent.start, + oinfo->oi_policy.l_extent.end, + &rs, &re)) + continue; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + GOTO(out_set, rc = -EIO); + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, + sizeof(*req->rq_oi.oi_oa)); + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_valid |= OBD_MD_FLGROUP; + + req->rq_oi.oi_oa->o_stripe_idx = i; + req->rq_oi.oi_cb_up = cb_update_punch; + + req->rq_oi.oi_policy.l_extent.start = rs; + req->rq_oi.oi_policy.l_extent.end = re; + req->rq_oi.oi_policy.l_extent.gid = -1; + + req->rq_oi.oi_capa = oinfo->oi_capa; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_punch_set(set); + RETURN(rc); +} + +int lov_fini_sync_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + LASSERT(set->set_exp); + if (atomic_read(&set->set_completes)) { + if (!atomic_read(&set->set_success)) + rc = -EIO; + /* FIXME update qos data here */ + } + + lov_put_reqset(set); + + RETURN(rc); +} + +/* The callback for osc_sync that finilizes a request info when a + * response is recieved. */ +static int cb_sync_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); +} + +int lov_prep_sync_set(struct obd_export *exp, struct obd_info *oinfo, + obd_off start, obd_off end, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &exp->exp_obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC_PTR(set); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_exp = exp; + set->set_oi = oinfo; + + for (i = 0; i < oinfo->oi_md->lsm_stripe_count; i++) { + struct lov_oinfo *loi = oinfo->oi_md->lsm_oinfo[i]; + struct lov_request *req; + obd_off rs, re; + + if (!lov_check_and_wait_active(lov, loi->loi_ost_idx)) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + if (!lov_stripe_intersects(oinfo->oi_md, i, start, end, &rs, + &re)) + continue; + + OBD_ALLOC_PTR(req); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + req->rq_stripe = i; + req->rq_idx = loi->loi_ost_idx; + + OBDO_ALLOC(req->rq_oi.oi_oa); + if (req->rq_oi.oi_oa == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + *req->rq_oi.oi_oa = *oinfo->oi_oa; + req->rq_oi.oi_oa->o_oi = loi->loi_oi; + req->rq_oi.oi_oa->o_stripe_idx = i; + + req->rq_oi.oi_policy.l_extent.start = rs; + req->rq_oi.oi_policy.l_extent.end = re; + req->rq_oi.oi_policy.l_extent.gid = -1; + req->rq_oi.oi_cb_up = cb_sync_update; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_sync_set(set); + RETURN(rc); +} + +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while(0) + +int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs,int success) +{ + ENTRY; + + if (success) { + __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, + LOV_MAGIC, 0); + if (osfs->os_files != LOV_U64_MAX) + lov_do_div64(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + lov_do_div64(osfs->os_ffree, expected_stripes); + + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + RETURN(0); + } + + RETURN(-EIO); +} + +int lov_fini_statfs_set(struct lov_request_set *set) +{ + int rc = 0; + ENTRY; + + if (set == NULL) + RETURN(0); + + if (atomic_read(&set->set_completes)) { + rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, + atomic_read(&set->set_success)); + } + lov_put_reqset(set); + RETURN(rc); +} + +void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, + int success) +{ + int shift = 0, quit = 0; + __u64 tmp; + + if (success == 0) { + memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); + } else { + if (osfs->os_bsize != lov_sfs->os_bsize) { + /* assume all block sizes are always powers of 2 */ + /* get the bits difference */ + tmp = osfs->os_bsize | lov_sfs->os_bsize; + for (shift = 0; shift <= 64; ++shift) { + if (tmp & 1) { + if (quit) + break; + else + quit = 1; + shift = 0; + } + tmp >>= 1; + } + } + + if (osfs->os_bsize < lov_sfs->os_bsize) { + osfs->os_bsize = lov_sfs->os_bsize; + + osfs->os_bfree >>= shift; + osfs->os_bavail >>= shift; + osfs->os_blocks >>= shift; + } else if (shift != 0) { + lov_sfs->os_bfree >>= shift; + lov_sfs->os_bavail >>= shift; + lov_sfs->os_blocks >>= shift; + } + osfs->os_bfree += lov_sfs->os_bfree; + osfs->os_bavail += lov_sfs->os_bavail; + osfs->os_blocks += lov_sfs->os_blocks; + /* XXX not sure about this one - depends on policy. + * - could be minimum if we always stripe on all OBDs + * (but that would be wrong for any other policy, + * if one of the OBDs has no more objects left) + * - could be sum if we stripe whole objects + * - could be average, just to give a nice number + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). + */ + LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); + } +} + +/* The callback for osc_statfs_async that finilizes a request info when a + * response is received. */ +static int cb_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct lov_request *lovreq; + struct lov_request_set *set; + struct obd_statfs *osfs, *lov_sfs; + struct lov_obd *lov; + struct lov_tgt_desc *tgt; + struct obd_device *lovobd, *tgtobd; + int success; + ENTRY; + + lovreq = container_of(oinfo, struct lov_request, rq_oi); + set = lovreq->rq_rqset; + lovobd = set->set_obd; + lov = &lovobd->u.lov; + osfs = set->set_oi->oi_osfs; + lov_sfs = oinfo->oi_osfs; + success = atomic_read(&set->set_success); + /* XXX: the same is done in lov_update_common_set, however + lovset->set_exp is not initialized. */ + lov_update_set(set, lovreq, rc); + if (rc) + GOTO(out, rc); + + obd_getref(lovobd); + tgt = lov->lov_tgts[lovreq->rq_idx]; + if (!tgt || !tgt->ltd_active) + GOTO(out_update, rc); + + tgtobd = class_exp2obd(tgt->ltd_exp); + spin_lock(&tgtobd->obd_osfs_lock); + memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); + if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) + tgtobd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&tgtobd->obd_osfs_lock); + +out_update: + lov_update_statfs(osfs, lov_sfs, success); + obd_putref(lovobd); + +out: + if (set->set_oi->oi_flags & OBD_STATFS_PTLRPCD && + lov_set_finished(set, 0)) { + lov_statfs_interpret(NULL, set, set->set_count != + atomic_read(&set->set_success)); + } + + RETURN(0); +} + +int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, + struct lov_request_set **reqset) +{ + struct lov_request_set *set; + struct lov_obd *lov = &obd->u.lov; + int rc = 0, i; + ENTRY; + + OBD_ALLOC(set, sizeof(*set)); + if (set == NULL) + RETURN(-ENOMEM); + lov_init_set(set); + + set->set_obd = obd; + set->set_oi = oinfo; + + /* We only get block data from the OBD */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + struct lov_request *req; + + if (lov->lov_tgts[i] == NULL || + (!lov_check_and_wait_active(lov, i) && + (oinfo->oi_flags & OBD_STATFS_NODELAY))) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); + continue; + } + + /* skip targets that have been explicitely disabled by the + * administrator */ + if (!lov->lov_tgts[i]->ltd_exp) { + CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); + continue; + } + + OBD_ALLOC(req, sizeof(*req)); + if (req == NULL) + GOTO(out_set, rc = -ENOMEM); + + OBD_ALLOC(req->rq_oi.oi_osfs, sizeof(*req->rq_oi.oi_osfs)); + if (req->rq_oi.oi_osfs == NULL) { + OBD_FREE(req, sizeof(*req)); + GOTO(out_set, rc = -ENOMEM); + } + + req->rq_idx = i; + req->rq_oi.oi_cb_up = cb_statfs_update; + req->rq_oi.oi_flags = oinfo->oi_flags; + + lov_set_add_req(req, set); + } + if (!set->set_count) + GOTO(out_set, rc = -EIO); + *reqset = set; + RETURN(rc); +out_set: + lov_fini_statfs_set(set); + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c new file mode 100644 index 000000000000..204ecd0b8639 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lovsub_dev.c @@ -0,0 +1,211 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub transfer operations. + * + */ + +static void lovsub_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lovsub_req *lsr; + + ENTRY; + lsr = cl2lovsub_req(slice); + OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem); + EXIT; +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. + */ +static void lovsub_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct lovsub_object *subobj; + + ENTRY; + subobj = cl2lovsub(obj); + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = subobj->lso_index; + EXIT; +} + +static const struct cl_req_operations lovsub_req_ops = { + .cro_attr_set = lovsub_req_attr_set, + .cro_completion = lovsub_req_completion +}; + +/***************************************************************************** + * + * Lov-sub device and device type functions. + * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_super = NULL; + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lovsub_req *lsr; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(lsr, lovsub_req_kmem, __GFP_IO); + if (lsr != NULL) { + cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations lovsub_cl_ops = { + .cdo_req_init = lovsub_req_init +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + lsd->acid_cl.cd_ops = &lovsub_cl_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_io.c b/drivers/staging/lustre/lustre/lov/lovsub_io.c new file mode 100644 index 000000000000..783ec687a4e7 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lovsub_io.c @@ -0,0 +1,55 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub io operations. + * + */ + +/* All trivial */ + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c new file mode 100644 index 000000000000..03bab17ccc64 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lovsub_lock.c @@ -0,0 +1,485 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub lock operations. + * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + ENTRY; + lsl = cl2lovsub_lock(slice); + LASSERT(list_empty(&lsl->lss_parents)); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); + EXIT; +} + +static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + ENTRY; + parent = lov->lls_cl.cls_lock; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lovsub-parent", current); + cl_lock_mutex_get(env, parent); + EXIT; +} + +static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + ENTRY; + parent = lov->lls_cl.cls_lock; + cl_lock_mutex_put(env, lov->lls_cl.cls_lock); + lu_ref_del(&parent->cll_reference, "lovsub-parent", current); + cl_lock_put(env, parent); + EXIT; +} + +/** + * Implements cl_lock_operations::clo_state() method for lovsub layer, which + * method is called whenever sub-lock state changes. Propagates state change + * to the top-locks. + */ +static void lovsub_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + ENTRY; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + struct lov_lock *lov = scan->lll_super; + struct cl_lock *parent = lov->lls_cl.cls_lock; + + if (sub->lss_active != parent) { + lovsub_parent_lock(env, lov); + cl_lock_signal(env, parent); + lovsub_parent_unlock(env, lov); + } + } + EXIT; +} + +/** + * Implementation of cl_lock_operation::clo_weigh() estimating lock weight by + * asking parent lock. + */ +static unsigned long lovsub_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lovsub_lock *lock = cl2lovsub_lock(slice); + struct lov_lock *lov; + unsigned long dumbbell; + + ENTRY; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + if (!list_empty(&lock->lss_parents)) { + /* + * It is not clear whether all parents have to be asked and + * their estimations summed, or it is enough to ask one. For + * the current usages, one is always enough. + */ + lov = container_of(lock->lss_parents.next, + struct lov_lock_link, lll_list)->lll_super; + + lovsub_parent_lock(env, lov); + dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock); + lovsub_parent_unlock(env, lov); + } else + dumbbell = 0; + + RETURN(dumbbell); +} + +/** + * Maps start/end offsets within a stripe, to offsets within a file. + */ +static void lovsub_lock_descr_map(const struct cl_lock_descr *in, + struct lov_object *lov, + int stripe, struct cl_lock_descr *out) +{ + pgoff_t size; /* stripe size in pages */ + pgoff_t skip; /* how many pages in every stripe are occupied by + * "other" stripes */ + pgoff_t start; + pgoff_t end; + + ENTRY; + start = in->cld_start; + end = in->cld_end; + + if (lov->lo_lsm->lsm_stripe_count > 1) { + size = cl_index(lov2cl(lov), lov->lo_lsm->lsm_stripe_size); + skip = (lov->lo_lsm->lsm_stripe_count - 1) * size; + + /* XXX overflow check here? */ + start += start/size * skip + stripe * size; + + if (end != CL_PAGE_EOF) { + end += end/size * skip + stripe * size; + /* + * And check for overflow... + */ + if (end < in->cld_end) + end = CL_PAGE_EOF; + } + } + out->cld_start = start; + out->cld_end = end; + EXIT; +} + +/** + * Adjusts parent lock extent when a sub-lock is attached to a parent. This is + * called in two ways: + * + * - as part of receive call-back, when server returns granted extent to + * the client, and + * + * - when top-lock finds existing sub-lock in the cache. + * + * Note, that lock mode is not propagated to the parent: i.e., if CLM_READ + * top-lock matches CLM_WRITE sub-lock, top-lock is still CLM_READ. + */ +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx) +{ + struct cl_lock *parent; + struct lovsub_object *subobj; + struct cl_lock_descr *pd; + struct cl_lock_descr *parent_descr; + int result; + + parent = lov->lls_cl.cls_lock; + parent_descr = &parent->cll_descr; + LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode)); + + subobj = cl2lovsub(sublock->lss_cl.cls_obj); + pd = &lov_env_info(env)->lti_ldescr; + + pd->cld_obj = parent_descr->cld_obj; + pd->cld_mode = parent_descr->cld_mode; + pd->cld_gid = parent_descr->cld_gid; + lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd); + lov->lls_sub[idx].sub_got = *d; + /* + * Notify top-lock about modification, if lock description changes + * materially. + */ + if (!cl_lock_ext_match(parent_descr, pd)) + result = cl_lock_modify(env, parent, pd); + else + result = 0; + return result; +} + +static int lovsub_lock_modify(const struct lu_env *env, + const struct cl_lock_slice *s, + const struct cl_lock_descr *d) +{ + struct lovsub_lock *lock = cl2lovsub_lock(s); + struct lov_lock_link *scan; + struct lov_lock *lov; + int result = 0; + + ENTRY; + + LASSERT(cl_lock_mode_match(d->cld_mode, + s->cls_lock->cll_descr.cld_mode)); + list_for_each_entry(scan, &lock->lss_parents, lll_list) { + int rc; + + lov = scan->lll_super; + lovsub_parent_lock(env, lov); + rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx); + lovsub_parent_unlock(env, lov); + result = result ?: rc; + } + RETURN(result); +} + +static int lovsub_lock_closure(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure) +{ + struct lovsub_lock *sub; + struct cl_lock *parent; + struct lov_lock_link *scan; + int result; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + ENTRY; + + sub = cl2lovsub_lock(slice); + result = 0; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + parent = scan->lll_super->lls_cl.cls_lock; + result = cl_lock_closure_build(env, parent, closure); + if (result != 0) + break; + } + RETURN(result); +} + +/** + * A helper function for lovsub_lock_delete() that deals with a given parent + * top-lock. + */ +static int lovsub_lock_delete_one(const struct lu_env *env, + struct cl_lock *child, struct lov_lock *lov) +{ + struct cl_lock *parent; + int result; + ENTRY; + + parent = lov->lls_cl.cls_lock; + if (parent->cll_error) + RETURN(0); + + result = 0; + switch (parent->cll_state) { + case CLS_ENQUEUED: + /* See LU-1355 for the case that a glimpse lock is + * interrupted by signal */ + LASSERT(parent->cll_flags & CLF_CANCELLED); + break; + case CLS_QUEUING: + case CLS_FREEING: + cl_lock_signal(env, parent); + break; + case CLS_INTRANSIT: + /* + * Here lies a problem: a sub-lock is canceled while top-lock + * is being unlocked. Top-lock cannot be moved into CLS_NEW + * state, because unlocking has to succeed eventually by + * placing lock into CLS_CACHED (or failing it), see + * cl_unuse_try(). Nor can top-lock be left in CLS_CACHED + * state, because lov maintains an invariant that all + * sub-locks exist in CLS_CACHED (this allows cached top-lock + * to be reused immediately). Nor can we wait for top-lock + * state to change, because this can be synchronous to the + * current thread. + * + * We know for sure that lov_lock_unuse() will be called at + * least one more time to finish un-using, so leave a mark on + * the top-lock, that will be seen by the next call to + * lov_lock_unuse(). + */ + if (cl_lock_is_intransit(parent)) + lov->lls_cancel_race = 1; + break; + case CLS_CACHED: + /* + * if a sub-lock is canceled move its top-lock into CLS_NEW + * state to preserve an invariant that a top-lock in + * CLS_CACHED is immediately ready for re-use (i.e., has all + * sub-locks), and so that next attempt to re-use the top-lock + * enqueues missing sub-lock. + */ + cl_lock_state_set(env, parent, CLS_NEW); + /* fall through */ + case CLS_NEW: + /* + * if last sub-lock is canceled, destroy the top-lock (which + * is now `empty') proactively. + */ + if (lov->lls_nr_filled == 0) { + /* ... but unfortunately, this cannot be done easily, + * as cancellation of a top-lock might acquire mutices + * of its other sub-locks, violating lock ordering, + * see cl_lock_{cancel,delete}() preconditions. + * + * To work around this, the mutex of this sub-lock is + * released, top-lock is destroyed, and sub-lock mutex + * acquired again. The list of parents has to be + * re-scanned from the beginning after this. + * + * Only do this if no mutices other than on @child and + * @parent are held by the current thread. + * + * TODO: The lock modal here is too complex, because + * the lock may be canceled and deleted by voluntarily: + * cl_lock_request + * -> osc_lock_enqueue_wait + * -> osc_lock_cancel_wait + * -> cl_lock_delete + * -> lovsub_lock_delete + * -> cl_lock_cancel/delete + * -> ... + * + * The better choice is to spawn a kernel thread for + * this purpose. -jay + */ + if (cl_lock_nr_mutexed(env) == 2) { + cl_lock_mutex_put(env, child); + cl_lock_cancel(env, parent); + cl_lock_delete(env, parent); + result = 1; + } + } + break; + case CLS_HELD: + CL_LOCK_DEBUG(D_ERROR, env, parent, "Delete CLS_HELD lock\n"); + default: + CERROR("Impossible state: %d\n", parent->cll_state); + LBUG(); + break; + } + + RETURN(result); +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked in "bottom-to-top" delete, when lock destruction starts from the + * sub-lock (e.g, as a result of ldlm lock LRU policy). + */ +static void lovsub_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *child = slice->cls_lock; + struct lovsub_lock *sub = cl2lovsub_lock(slice); + int restart; + + LASSERT(cl_lock_is_mutexed(child)); + + ENTRY; + /* + * Destruction of a sub-lock might take multiple iterations, because + * when the last sub-lock of a given top-lock is deleted, top-lock is + * canceled proactively, and this requires to release sub-lock + * mutex. Once sub-lock mutex has been released, list of its parents + * has to be re-scanned from the beginning. + */ + do { + struct lov_lock *lov; + struct lov_lock_link *scan; + struct lov_lock_link *temp; + struct lov_lock_sub *subdata; + + restart = 0; + list_for_each_entry_safe(scan, temp, + &sub->lss_parents, lll_list) { + lov = scan->lll_super; + subdata = &lov->lls_sub[scan->lll_idx]; + lovsub_parent_lock(env, lov); + subdata->sub_got = subdata->sub_descr; + lov_lock_unlink(env, scan, sub); + restart = lovsub_lock_delete_one(env, child, lov); + lovsub_parent_unlock(env, lov); + + if (restart) { + cl_lock_mutex_get(env, child); + break; + } + } + } while (restart); + EXIT; +} + +static int lovsub_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock *lov; + struct lov_lock_link *scan; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + lov = scan->lll_super; + (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov); + if (lov != NULL) + cl_lock_descr_print(env, cookie, p, + &lov->lls_cl.cls_lock->cll_descr); + (*p)(env, cookie, "] "); + } + return 0; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, + .clo_state = lovsub_lock_state, + .clo_delete = lovsub_lock_delete, + .clo_modify = lovsub_lock_modify, + .clo_closure = lovsub_lock_closure, + .clo_weigh = lovsub_lock_weigh, + .clo_print = lovsub_lock_print +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lsk, lovsub_lock_kmem, __GFP_IO); + if (lsk != NULL) { + INIT_LIST_HEAD(&lsk->lss_parents); + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c new file mode 100644 index 000000000000..1b83d9081c40 --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lovsub_object.c @@ -0,0 +1,170 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub object operations. + * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + ENTRY; + + /* We can't assume lov was assigned here, because of the shadow + * object handling in lu_object_find. + */ + if (lov) { + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los); + spin_lock(&lov->u.raid0.lo_sub_lock); + lov->u.raid0.lo_sub[los->lso_index] = NULL; + spin_unlock(&lov->u.raid0.lo_sub_lock); + } + + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%d]", los->lso_index); +} + +static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov)->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + + + +static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, + .coo_attr_set = lovsub_attr_set, + .coo_glimpse = lovsub_object_glimpse +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(los, lovsub_object_kmem, __GFP_IO); + if (los != NULL) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c new file mode 100644 index 000000000000..bc9e683968da --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lovsub_page.c @@ -0,0 +1,72 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov + * @{ + */ + +/***************************************************************************** + * + * Lovsub page operations. + * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *unused) +{ + struct lovsub_page *lsb = cl_object_page_slice(obj, page); + ENTRY; + + cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops); + RETURN(0); +} + +/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c new file mode 100644 index 000000000000..732d5c70edbb --- /dev/null +++ b/drivers/staging/lustre/lustre/lov/lproc_lov.c @@ -0,0 +1,304 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "lov_internal.h" + +#ifdef LPROCFS +static int lov_rd_stripesize(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size); +} + +static int lov_wr_stripesize(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_size(&val); + desc->ld_default_stripe_size = val; + return count; +} + +static int lov_rd_stripeoffset(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_offset); +} + +static int lov_wr_stripeoffset(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + __u64 val; + int rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + desc->ld_default_stripe_offset = val; + return count; +} + +static int lov_rd_stripetype(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* dev = (struct obd_device*)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_pattern); +} + +static int lov_wr_stripetype(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_pattern(&val); + desc->ld_pattern = val; + return count; +} + +static int lov_rd_stripecount(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, "%d\n", + (__s16)(desc->ld_default_stripe_count + 1) - 1); +} + +static int lov_wr_stripecount(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc; + int val, rc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + lov_fix_desc_stripe_count(&val); + desc->ld_default_stripe_count = val; + return count; +} + +static int lov_rd_numobd(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_tgt_count); + +} + +static int lov_rd_activeobd(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* dev = (struct obd_device*)data; + struct lov_desc *desc; + + LASSERT(dev != NULL); + desc = &dev->u.lov.desc; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_active_tgt_count); +} + +static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*) data; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + *eof = 1; + return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid); +} + +static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + ++*pos; + } + return NULL; +} + +static void lov_tgt_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_device *dev = p->private; + struct lov_obd *lov = &dev->u.lov; + + while (++*pos < lov->desc.ld_tgt_count) { + if (lov->lov_tgts[*pos]) + return lov->lov_tgts[*pos]; + } + return NULL; +} + +static int lov_tgt_seq_show(struct seq_file *p, void *v) +{ + struct lov_tgt_desc *tgt = v; + return seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, + obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? "" : "IN"); +} + +struct seq_operations lov_tgt_sops = { + .start = lov_tgt_seq_start, + .stop = lov_tgt_seq_stop, + .next = lov_tgt_seq_next, + .show = lov_tgt_seq_show, +}; + +static int lov_target_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + int rc; + + LPROCFS_ENTRY_AND_CHECK(dp); + rc = seq_open(file, &lov_tgt_sops); + if (rc) { + LPROCFS_EXIT(); + return rc; + } + + seq = file->private_data; + seq->private = dp->data; + return 0; +} + +struct lprocfs_vars lprocfs_lov_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "stripesize", lov_rd_stripesize, lov_wr_stripesize, 0 }, + { "stripeoffset", lov_rd_stripeoffset, lov_wr_stripeoffset, 0 }, + { "stripecount", lov_rd_stripecount, lov_wr_stripecount, 0 }, + { "stripetype", lov_rd_stripetype, lov_wr_stripetype, 0 }, + { "numobd", lov_rd_numobd, 0, 0 }, + { "activeobd", lov_rd_activeobd, 0, 0 }, + { "filestotal", lprocfs_rd_filestotal, 0, 0 }, + { "filesfree", lprocfs_rd_filesfree, 0, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/ + { "blocksize", lprocfs_rd_blksize, 0, 0 }, + { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, + { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, + { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, + { "desc_uuid", lov_rd_desc_uuid, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_lov_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_lov_module_vars; + lvars->obd_vars = lprocfs_lov_obd_vars; +} + +struct file_operations lov_proc_target_fops = { + .owner = THIS_MODULE, + .open = lov_target_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/lvfs/Makefile b/drivers/staging/lustre/lustre/lvfs/Makefile new file mode 100644 index 000000000000..f50b1c574385 --- /dev/null +++ b/drivers/staging/lustre/lustre/lvfs/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_LUSTRE_FS) += lvfs.o + +lvfs-y := lvfs_linux.o fsfilt.o lvfs_lib.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt.c b/drivers/staging/lustre/lustre/lvfs/fsfilt.c new file mode 100644 index 000000000000..064445cbdb57 --- /dev/null +++ b/drivers/staging/lustre/lustre/lvfs/fsfilt.c @@ -0,0 +1,138 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include +#include +#include +#include + +LIST_HEAD(fsfilt_types); + +static struct fsfilt_operations *fsfilt_search_type(const char *type) +{ + struct fsfilt_operations *found; + struct list_head *p; + + list_for_each(p, &fsfilt_types) { + found = list_entry(p, struct fsfilt_operations, fs_list); + if (!strcmp(found->fs_type, type)) { + return found; + } + } + return NULL; +} + +int fsfilt_register_ops(struct fsfilt_operations *fs_ops) +{ + struct fsfilt_operations *found; + + /* lock fsfilt_types list */ + if ((found = fsfilt_search_type(fs_ops->fs_type))) { + if (found != fs_ops) { + CERROR("different operations for type %s\n", + fs_ops->fs_type); + /* unlock fsfilt_types list */ + RETURN(-EEXIST); + } + } else { + try_module_get(THIS_MODULE); + list_add(&fs_ops->fs_list, &fsfilt_types); + } + + /* unlock fsfilt_types list */ + return 0; +} +EXPORT_SYMBOL(fsfilt_register_ops); + +void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops) +{ + struct list_head *p; + + /* lock fsfilt_types list */ + list_for_each(p, &fsfilt_types) { + struct fsfilt_operations *found; + + found = list_entry(p, typeof(*found), fs_list); + if (found == fs_ops) { + list_del(p); + module_put(THIS_MODULE); + break; + } + } + /* unlock fsfilt_types list */ +} +EXPORT_SYMBOL(fsfilt_unregister_ops); + +struct fsfilt_operations *fsfilt_get_ops(const char *type) +{ + struct fsfilt_operations *fs_ops; + + /* lock fsfilt_types list */ + if (!(fs_ops = fsfilt_search_type(type))) { + char name[32]; + int rc; + + snprintf(name, sizeof(name) - 1, "fsfilt_%s", type); + name[sizeof(name) - 1] = '\0'; + + if (!(rc = request_module("%s", name))) { + fs_ops = fsfilt_search_type(type); + CDEBUG(D_INFO, "Loaded module '%s'\n", name); + if (!fs_ops) + rc = -ENOENT; + } + + if (rc) { + CERROR("Can't find %s interface\n", name); + RETURN(ERR_PTR(rc < 0 ? rc : -rc)); + /* unlock fsfilt_types list */ + } + } + try_module_get(fs_ops->fs_owner); + /* unlock fsfilt_types list */ + + return fs_ops; +} +EXPORT_SYMBOL(fsfilt_get_ops); + +void fsfilt_put_ops(struct fsfilt_operations *fs_ops) +{ + module_put(fs_ops->fs_owner); +} +EXPORT_SYMBOL(fsfilt_put_ops); diff --git a/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c new file mode 100644 index 000000000000..c1e99b37572e --- /dev/null +++ b/drivers/staging/lustre/lustre/lvfs/fsfilt_ext3.c @@ -0,0 +1,761 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lvfs/fsfilt_ext3.c + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#ifdef HAVE_EXT_PBLOCK /* Name changed to ext4_ext_pblock for kernel 2.6.35 */ +#define ext3_ext_pblock(ex) ext_pblock((ex)) +#endif + +/* for kernels 2.6.18 and later */ +#define FSFILT_SINGLEDATA_TRANS_BLOCKS(sb) EXT3_SINGLEDATA_TRANS_BLOCKS(sb) + +#define fsfilt_ext3_ext_insert_extent(handle, inode, path, newext, flag) \ + ext3_ext_insert_extent(handle, inode, path, newext, flag) + +#define ext3_mb_discard_inode_preallocations(inode) \ + ext3_discard_preallocations(inode) + +#define fsfilt_log_start_commit(journal, tid) jbd2_log_start_commit(journal, tid) +#define fsfilt_log_wait_commit(journal, tid) jbd2_log_wait_commit(journal, tid) + +static struct kmem_cache *fcb_cache; + +struct fsfilt_cb_data { + struct ext4_journal_cb_entry cb_jcb; /* private data - MUST BE FIRST */ + fsfilt_cb_t cb_func; /* MDS/OBD completion function */ + struct obd_device *cb_obd; /* MDS/OBD completion device */ + __u64 cb_last_rcvd; /* MDS/OST last committed operation */ + void *cb_data; /* MDS/OST completion function data */ +}; + +static char *fsfilt_ext3_get_label(struct super_block *sb) +{ + return EXT3_SB(sb)->s_es->s_volume_name; +} + +/* kernel has ext4_blocks_for_truncate since linux-3.1.1 */ +# include + +/* + * We don't currently need any additional blocks for rmdir and + * unlink transactions because we are storing the OST oa_id inside + * the inode (which we will be changing anyways as part of this + * transaction). + */ +static void *fsfilt_ext3_start(struct inode *inode, int op, void *desc_private, + int logs) +{ + /* For updates to the last received file */ + int nblocks = FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb); + journal_t *journal; + void *handle; + + if (current->journal_info) { + CDEBUG(D_INODE, "increasing refcount on %p\n", + current->journal_info); + goto journal_start; + } + + switch(op) { + case FSFILT_OP_UNLINK: + /* delete one file + create/update logs for each stripe */ + nblocks += EXT3_DELETE_TRANS_BLOCKS(inode->i_sb); + nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS + + FSFILT_SINGLEDATA_TRANS_BLOCKS(inode->i_sb)) * logs; + break; + case FSFILT_OP_CANCEL_UNLINK: + LASSERT(logs == 1); + + /* blocks for log header bitmap update OR + * blocks for catalog header bitmap update + unlink of logs + + * blocks for delete the inode (include blocks truncating). */ + nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) + + EXT3_DELETE_TRANS_BLOCKS(inode->i_sb) + + ext4_blocks_for_truncate(inode) + 3; + break; + default: CERROR("unknown transaction start op %d\n", op); + LBUG(); + } + + LASSERT(current->journal_info == desc_private); + journal = EXT3_SB(inode->i_sb)->s_journal; + if (nblocks > journal->j_max_transaction_buffers) { + CWARN("too many credits %d for op %ux%u using %d instead\n", + nblocks, op, logs, journal->j_max_transaction_buffers); + nblocks = journal->j_max_transaction_buffers; + } + + journal_start: + LASSERTF(nblocks > 0, "can't start %d credit transaction\n", nblocks); + handle = ext3_journal_start(inode, nblocks); + + if (!IS_ERR(handle)) + LASSERT(current->journal_info == handle); + else + CERROR("error starting handle for op %u (%u credits): rc %ld\n", + op, nblocks, PTR_ERR(handle)); + return handle; +} + +static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync) +{ + int rc; + handle_t *handle = h; + + LASSERT(current->journal_info == handle); + if (force_sync) + handle->h_sync = 1; /* recovery likes this */ + + rc = ext3_journal_stop(handle); + + return rc; +} + +#ifndef EXT3_EXTENTS_FL +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#endif + +#ifndef EXT_ASSERT +#define EXT_ASSERT(cond) BUG_ON(!(cond)) +#endif + +#define EXT_GENERATION(inode) (EXT4_I(inode)->i_ext_generation) +#define ext3_ext_base inode +#define ext3_ext_base2inode(inode) (inode) +#define EXT_DEPTH(inode) ext_depth(inode) +#define fsfilt_ext3_ext_walk_space(inode, block, num, cb, cbdata) \ + ext3_ext_walk_space(inode, block, num, cb, cbdata); + +struct bpointers { + unsigned long *blocks; + unsigned long start; + int num; + int init_num; + int create; +}; + +static long ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, + unsigned long block, int *aflags) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + int depth; + + if (path) { + struct ext3_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + if ((ex = path[depth].p_ext)) + return ext4_ext_pblock(ex) + (block - le32_to_cpu(ex->ee_block)); + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour + block; +} + +#define ll_unmap_underlying_metadata(sb, blocknr) \ + unmap_underlying_metadata((sb)->s_bdev, blocknr) + +#ifndef EXT3_MB_HINT_GROUP_ALLOC +static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base, + struct ext3_ext_path *path, unsigned long block, + unsigned long *count, int *err) +{ + unsigned long pblock, goal; + int aflags = 0; + struct inode *inode = ext3_ext_base2inode(base); + + goal = ext3_ext_find_goal(inode, path, block, &aflags); + aflags |= 2; /* block have been already reserved */ + pblock = ext3_mb_new_blocks(handle, inode, goal, count, aflags, err); + return pblock; + +} +#else +static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base, + struct ext3_ext_path *path, unsigned long block, + unsigned long *count, int *err) +{ + struct inode *inode = ext3_ext_base2inode(base); + struct ext3_allocation_request ar; + unsigned long pblock; + int aflags; + + /* find neighbour allocated blocks */ + ar.lleft = block; + *err = ext3_ext_search_left(base, path, &ar.lleft, &ar.pleft); + if (*err) + return 0; + ar.lright = block; + *err = ext3_ext_search_right(base, path, &ar.lright, &ar.pright); + if (*err) + return 0; + + /* allocate new block */ + ar.goal = ext3_ext_find_goal(inode, path, block, &aflags); + ar.inode = inode; + ar.logical = block; + ar.len = *count; + ar.flags = EXT3_MB_HINT_DATA; + pblock = ext3_mb_new_blocks(handle, &ar, err); + *count = ar.len; + return pblock; +} +#endif + +static int ext3_ext_new_extent_cb(struct ext3_ext_base *base, + struct ext3_ext_path *path, + struct ext3_ext_cache *cex, +#ifdef HAVE_EXT_PREPARE_CB_EXTENT + struct ext3_extent *ex, +#endif + void *cbdata) +{ + struct bpointers *bp = cbdata; + struct inode *inode = ext3_ext_base2inode(base); + struct ext3_extent nex; + unsigned long pblock; + unsigned long tgen; + int err, i; + unsigned long count; + handle_t *handle; + +#ifdef EXT3_EXT_CACHE_EXTENT + if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) +#else + if ((cex->ec_len != 0) && (cex->ec_start != 0)) +#endif + { + err = EXT_CONTINUE; + goto map; + } + + if (bp->create == 0) { + i = 0; + if (cex->ec_block < bp->start) + i = bp->start - cex->ec_block; + if (i >= cex->ec_len) + CERROR("nothing to do?! i = %d, e_num = %u\n", + i, cex->ec_len); + for (; i < cex->ec_len && bp->num; i++) { + *(bp->blocks) = 0; + bp->blocks++; + bp->num--; + bp->start++; + } + + return EXT_CONTINUE; + } + + tgen = EXT_GENERATION(base); + count = ext3_ext_calc_credits_for_insert(base, path); + + handle = ext3_journal_start(inode, count+EXT3_ALLOC_NEEDED+1); + if (IS_ERR(handle)) { + return PTR_ERR(handle); + } + + if (tgen != EXT_GENERATION(base)) { + /* the tree has changed. so path can be invalid at moment */ + ext3_journal_stop(handle); + return EXT_REPEAT; + } + + /* In 2.6.32 kernel, ext4_ext_walk_space()'s callback func is not + * protected by i_data_sem as whole. so we patch it to store + * generation to path and now verify the tree hasn't changed */ + down_write((&EXT4_I(inode)->i_data_sem)); + + /* validate extent, make sure the extent tree does not changed */ + if (EXT_GENERATION(base) != path[0].p_generation) { + /* cex is invalid, try again */ + up_write(&EXT4_I(inode)->i_data_sem); + ext3_journal_stop(handle); + return EXT_REPEAT; + } + + count = cex->ec_len; + pblock = new_blocks(handle, base, path, cex->ec_block, &count, &err); + if (!pblock) + goto out; + EXT_ASSERT(count <= cex->ec_len); + + /* insert new extent */ + nex.ee_block = cpu_to_le32(cex->ec_block); + ext3_ext_store_pblock(&nex, pblock); + nex.ee_len = cpu_to_le16(count); + err = fsfilt_ext3_ext_insert_extent(handle, base, path, &nex, 0); + if (err) { + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ +#ifdef EXT3_MB_HINT_GROUP_ALLOC + ext3_mb_discard_inode_preallocations(inode); +#endif +#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */ + ext3_free_blocks(handle, inode, NULL, ext4_ext_pblock(&nex), + cpu_to_le16(nex.ee_len), 0); +#else + ext3_free_blocks(handle, inode, ext4_ext_pblock(&nex), + cpu_to_le16(nex.ee_len), 0); +#endif + goto out; + } + + /* + * Putting len of the actual extent we just inserted, + * we are asking ext3_ext_walk_space() to continue + * scaning after that block + */ + cex->ec_len = le16_to_cpu(nex.ee_len); + cex->ec_start = ext4_ext_pblock(&nex); + BUG_ON(le16_to_cpu(nex.ee_len) == 0); + BUG_ON(le32_to_cpu(nex.ee_block) != cex->ec_block); + +out: + up_write((&EXT4_I(inode)->i_data_sem)); + ext3_journal_stop(handle); +map: + if (err >= 0) { + /* map blocks */ + if (bp->num == 0) { + CERROR("hmm. why do we find this extent?\n"); + CERROR("initial space: %lu:%u\n", + bp->start, bp->init_num); +#ifdef EXT3_EXT_CACHE_EXTENT + CERROR("current extent: %u/%u/%llu %d\n", + cex->ec_block, cex->ec_len, + (unsigned long long)cex->ec_start, + cex->ec_type); +#else + CERROR("current extent: %u/%u/%llu\n", + cex->ec_block, cex->ec_len, + (unsigned long long)cex->ec_start); +#endif + } + i = 0; + if (cex->ec_block < bp->start) + i = bp->start - cex->ec_block; + if (i >= cex->ec_len) + CERROR("nothing to do?! i = %d, e_num = %u\n", + i, cex->ec_len); + for (; i < cex->ec_len && bp->num; i++) { + *(bp->blocks) = cex->ec_start + i; +#ifdef EXT3_EXT_CACHE_EXTENT + if (cex->ec_type != EXT3_EXT_CACHE_EXTENT) +#else + if ((cex->ec_len == 0) || (cex->ec_start == 0)) +#endif + { + /* unmap any possible underlying metadata from + * the block device mapping. bug 6998. */ + ll_unmap_underlying_metadata(inode->i_sb, + *(bp->blocks)); + } + bp->blocks++; + bp->num--; + bp->start++; + } + } + return err; +} + +int fsfilt_map_nblocks(struct inode *inode, unsigned long block, + unsigned long num, unsigned long *blocks, + int create) +{ + struct ext3_ext_base *base = inode; + struct bpointers bp; + int err; + + CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n", + block, block + num - 1, (unsigned) inode->i_ino); + + bp.blocks = blocks; + bp.start = block; + bp.init_num = bp.num = num; + bp.create = create; + + err = fsfilt_ext3_ext_walk_space(base, block, num, + ext3_ext_new_extent_cb, &bp); + ext3_ext_invalidate_cache(base); + + return err; +} + +int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int create) +{ + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + int rc = 0, i = 0; + struct page *fp = NULL; + int clen = 0; + + CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", + inode->i_ino, pages, (*page)->index); + + /* pages are sorted already. so, we just have to find + * contig. space and process them properly */ + while (i < pages) { + if (fp == NULL) { + /* start new extent */ + fp = *page++; + clen = 1; + i++; + continue; + } else if (fp->index + clen == (*page)->index) { + /* continue the extent */ + page++; + clen++; + i++; + continue; + } + + /* process found extent */ + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + create); + if (rc) + GOTO(cleanup, rc); + + /* look for next extent */ + fp = NULL; + blocks += blocks_per_page * clen; + } + + if (fp) + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + create); +cleanup: + return rc; +} + +int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int create) +{ + int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits; + unsigned long *b; + int rc = 0, i; + + for (i = 0, b = blocks; i < pages; i++, page++) { + rc = ext3_map_inode_page(inode, *page, b, create); + if (rc) { + CERROR("ino %lu, blk %lu create %d: rc %d\n", + inode->i_ino, *b, create, rc); + break; + } + + b += blocks_per_page; + } + return rc; +} + +int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int create, struct mutex *optional_mutex) +{ + int rc; + + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) { + rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages, + blocks, create); + return rc; + } + if (optional_mutex != NULL) + mutex_lock(optional_mutex); + rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create); + if (optional_mutex != NULL) + mutex_unlock(optional_mutex); + + return rc; +} + +int fsfilt_ext3_read(struct inode *inode, void *buf, int size, loff_t *offs) +{ + unsigned long block; + struct buffer_head *bh; + int err, blocksize, csize, boffs, osize = size; + + /* prevent reading after eof */ + spin_lock(&inode->i_lock); + if (i_size_read(inode) < *offs + size) { + size = i_size_read(inode) - *offs; + spin_unlock(&inode->i_lock); + if (size < 0) { + CDEBUG(D_EXT2, "size %llu is too short for read @%llu\n", + i_size_read(inode), *offs); + return -EBADR; + } else if (size == 0) { + return 0; + } + } else { + spin_unlock(&inode->i_lock); + } + + blocksize = 1 << inode->i_blkbits; + + while (size > 0) { + block = *offs >> inode->i_blkbits; + boffs = *offs & (blocksize - 1); + csize = min(blocksize - boffs, size); + bh = ext3_bread(NULL, inode, block, 0, &err); + if (!bh) { + CERROR("can't read block: %d\n", err); + return err; + } + + memcpy(buf, bh->b_data + boffs, csize); + brelse(bh); + + *offs += csize; + buf += csize; + size -= csize; + } + return osize; +} +EXPORT_SYMBOL(fsfilt_ext3_read); + +static int fsfilt_ext3_read_record(struct file * file, void *buf, + int size, loff_t *offs) +{ + int rc; + rc = fsfilt_ext3_read(file->f_dentry->d_inode, buf, size, offs); + if (rc > 0) + rc = 0; + return rc; +} + +int fsfilt_ext3_write_handle(struct inode *inode, void *buf, int bufsize, + loff_t *offs, handle_t *handle) +{ + struct buffer_head *bh = NULL; + loff_t old_size = i_size_read(inode), offset = *offs; + loff_t new_size = i_size_read(inode); + unsigned long block; + int err = 0, blocksize = 1 << inode->i_blkbits, size, boffs; + + while (bufsize > 0) { + if (bh != NULL) + brelse(bh); + + block = offset >> inode->i_blkbits; + boffs = offset & (blocksize - 1); + size = min(blocksize - boffs, bufsize); + bh = ext3_bread(handle, inode, block, 1, &err); + if (!bh) { + CERROR("can't read/create block: %d\n", err); + break; + } + + err = ext3_journal_get_write_access(handle, bh); + if (err) { + CERROR("journal_get_write_access() returned error %d\n", + err); + break; + } + LASSERT(bh->b_data + boffs + size <= bh->b_data + bh->b_size); + memcpy(bh->b_data + boffs, buf, size); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) { + CERROR("journal_dirty_metadata() returned error %d\n", + err); + break; + } + if (offset + size > new_size) + new_size = offset + size; + offset += size; + bufsize -= size; + buf += size; + } + if (bh) + brelse(bh); + + /* correct in-core and on-disk sizes */ + if (new_size > i_size_read(inode)) { + spin_lock(&inode->i_lock); + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (i_size_read(inode) > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = i_size_read(inode); + if (i_size_read(inode) > old_size) { + spin_unlock(&inode->i_lock); + mark_inode_dirty(inode); + } else { + spin_unlock(&inode->i_lock); + } + } + + if (err == 0) + *offs = offset; + return err; +} +EXPORT_SYMBOL(fsfilt_ext3_write_handle); + +static int fsfilt_ext3_write_record(struct file *file, void *buf, int bufsize, + loff_t *offs, int force_sync) +{ + struct inode *inode = file->f_dentry->d_inode; + handle_t *handle; + int err, block_count = 0, blocksize; + + /* Determine how many transaction credits are needed */ + blocksize = 1 << inode->i_blkbits; + block_count = (*offs & (blocksize - 1)) + bufsize; + block_count = (block_count + blocksize - 1) >> inode->i_blkbits; + + handle = ext3_journal_start(inode, + block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2); + if (IS_ERR(handle)) { + CERROR("can't start transaction for %d blocks (%d bytes)\n", + block_count * EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + 2, + bufsize); + return PTR_ERR(handle); + } + + err = fsfilt_ext3_write_handle(inode, buf, bufsize, offs, handle); + + if (!err && force_sync) + handle->h_sync = 1; /* recovery likes this */ + + ext3_journal_stop(handle); + + return err; +} + +static int fsfilt_ext3_setup(struct super_block *sb) +{ + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + CERROR("ext3 mounted without journal\n"); + return -EINVAL; + } + +#ifdef S_PDIROPS + CWARN("Enabling PDIROPS\n"); + set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS); + sb->s_flags |= S_PDIROPS; +#endif + if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) + CWARN("filesystem doesn't have dir_index feature enabled\n"); + return 0; +} +static struct fsfilt_operations fsfilt_ext3_ops = { + .fs_type = "ext3", + .fs_owner = THIS_MODULE, + .fs_getlabel = fsfilt_ext3_get_label, + .fs_start = fsfilt_ext3_start, + .fs_commit = fsfilt_ext3_commit, + .fs_map_inode_pages = fsfilt_ext3_map_inode_pages, + .fs_write_record = fsfilt_ext3_write_record, + .fs_read_record = fsfilt_ext3_read_record, + .fs_setup = fsfilt_ext3_setup, +}; + +static int __init fsfilt_ext3_init(void) +{ + int rc; + + fcb_cache = kmem_cache_create("fsfilt_ext3_fcb", + sizeof(struct fsfilt_cb_data), 0, 0); + if (!fcb_cache) { + CERROR("error allocating fsfilt journal callback cache\n"); + GOTO(out, rc = -ENOMEM); + } + + rc = fsfilt_register_ops(&fsfilt_ext3_ops); + + if (rc) { + int err = kmem_cache_destroy(fcb_cache); + LASSERTF(err == 0, "error destroying new cache: rc %d\n", err); + } +out: + return rc; +} + +static void __exit fsfilt_ext3_exit(void) +{ + int rc; + + fsfilt_unregister_ops(&fsfilt_ext3_ops); + rc = kmem_cache_destroy(fcb_cache); + LASSERTF(rc == 0, "couldn't destroy fcb_cache slab\n"); +} + +module_init(fsfilt_ext3_init); +module_exit(fsfilt_ext3_exit); + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre ext3 Filesystem Helper v0.1"); +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c new file mode 100644 index 000000000000..97a8be2300dd --- /dev/null +++ b/drivers/staging/lustre/lustre/lvfs/lvfs_lib.c @@ -0,0 +1,173 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lvfs/lvfs_lib.c + * + * Lustre filesystem abstraction routines + * + * Author: Andreas Dilger + */ +#include +#include +#include + +#ifdef LPROCFS +void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + percpu_cntr->lc_count++; + + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * lprocfs_counter_add() can be called in interrupt context, + * as memory allocation could trigger memory shrinker call + * ldlm_pool_shrink(), which calls lprocfs_counter_add(). + * LU-1727. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq += amount; + else + percpu_cntr->lc_sum += amount; + + if (header->lc_config & LPROCFS_CNTR_STDDEV) + percpu_cntr->lc_sumsquare += (__s64)amount * amount; + if (amount < percpu_cntr->lc_min) + percpu_cntr->lc_min = amount; + if (amount > percpu_cntr->lc_max) + percpu_cntr->lc_max = amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_add); + +void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int smp_id; + unsigned long flags = 0; + + if (stats == NULL) + return; + + /* With per-client stats, statistics are allocated only for + * single CPU area, so the smp_id should be 0 always. */ + smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); + if (smp_id < 0) + return; + + header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); + if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + /* + * Sometimes we use RCU callbacks to free memory which calls + * lprocfs_counter_sub(), and RCU callbacks may execute in + * softirq context - right now that's the only case we're in + * softirq context here, use separate counter for that. + * bz20650. + * + * Only obd_memory uses LPROCFS_STATS_FLAG_IRQ_SAFE + * flag, because it needs accurate counting lest memory leak + * check reports error. + */ + if (in_interrupt() && + (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq -= amount; + else + percpu_cntr->lc_sum -= amount; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_sub); + +int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) +{ + struct lprocfs_counter *cntr; + unsigned int percpusize; + int rc = -ENOMEM; + unsigned long flags = 0; + int i; + + LASSERT(stats->ls_percpu[cpuid] == NULL); + LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); + + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[cpuid], percpusize); + if (stats->ls_percpu[cpuid] != NULL) { + rc = 0; + if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + spin_lock_irqsave(&stats->ls_lock, flags); + else + spin_lock(&stats->ls_lock); + if (stats->ls_biggest_alloc_num <= cpuid) + stats->ls_biggest_alloc_num = cpuid + 1; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) { + spin_unlock_irqrestore(&stats->ls_lock, flags); + } else { + spin_unlock(&stats->ls_lock); + } + } + /* initialize the ls_percpu[cpuid] non-zero counter */ + for (i = 0; i < stats->ls_num; ++i) { + cntr = lprocfs_stats_counter_get(stats, cpuid, i); + cntr->lc_min = LC_MIN_INIT; + } + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_stats_alloc_one); +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c new file mode 100644 index 000000000000..1e6f32c3549b --- /dev/null +++ b/drivers/staging/lustre/lustre/lvfs/lvfs_linux.c @@ -0,0 +1,295 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lvfs/lvfs_linux.c + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lprocfs_stats *obd_memory = NULL; +EXPORT_SYMBOL(obd_memory); +/* refine later and change to seqlock or simlar from libcfs */ + +/* Debugging check only needed during development */ +#ifdef OBD_CTXT_DEBUG +# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) +# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERTF(!segment_eq(get_fs(), get_ds()),\ + msg) +# define ASSERT_KERNEL_CTXT(msg) LASSERTF(segment_eq(get_fs(), get_ds()), msg) +#else +# define ASSERT_CTXT_MAGIC(magic) do {} while(0) +# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) +# define ASSERT_KERNEL_CTXT(msg) do {} while(0) +#endif + +static void push_group_info(struct lvfs_run_ctxt *save, + struct group_info *ginfo) +{ + if (!ginfo) { + save->ngroups = current_ngroups; + current_ngroups = 0; + } else { + struct cred *cred; + task_lock(current); + save->group_info = current_cred()->group_info; + if ((cred = prepare_creds())) { + cred->group_info = ginfo; + commit_creds(cred); + } + task_unlock(current); + } +} + +static void pop_group_info(struct lvfs_run_ctxt *save, + struct group_info *ginfo) +{ + if (!ginfo) { + current_ngroups = save->ngroups; + } else { + struct cred *cred; + task_lock(current); + if ((cred = prepare_creds())) { + cred->group_info = save->group_info; + commit_creds(cred); + } + task_unlock(current); + } +} + +/* push / pop to root of obd store */ +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *uc) +{ + /* if there is underlaying dt_device then push_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + //ASSERT_NOT_KERNEL_CTXT("already in kernel context!\n"); + ASSERT_CTXT_MAGIC(new_ctx->magic); + OBD_SET_CTXT_MAGIC(save); + + save->fs = get_fs(); + LASSERT(d_refcount(cfs_fs_pwd(current->fs))); + LASSERT(d_refcount(new_ctx->pwd)); + save->pwd = dget(cfs_fs_pwd(current->fs)); + save->pwdmnt = mntget(cfs_fs_mnt(current->fs)); + save->luc.luc_umask = current_umask(); + save->ngroups = current_cred()->group_info->ngroups; + + LASSERT(save->pwd); + LASSERT(save->pwdmnt); + LASSERT(new_ctx->pwd); + LASSERT(new_ctx->pwdmnt); + + if (uc) { + struct cred *cred; + save->luc.luc_uid = current_uid(); + save->luc.luc_gid = current_gid(); + save->luc.luc_fsuid = current_fsuid(); + save->luc.luc_fsgid = current_fsgid(); + save->luc.luc_cap = current_cap(); + + if ((cred = prepare_creds())) { + cred->uid = uc->luc_uid; + cred->gid = uc->luc_gid; + cred->fsuid = uc->luc_fsuid; + cred->fsgid = uc->luc_fsgid; + cred->cap_effective = uc->luc_cap; + commit_creds(cred); + } + + push_group_info(save, + uc->luc_ginfo ?: + uc->luc_identity ? uc->luc_identity->mi_ginfo : + NULL); + } + current->fs->umask = 0; /* umask already applied on client */ + set_fs(new_ctx->fs); + ll_set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); +} +EXPORT_SYMBOL(push_ctxt); + +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *uc) +{ + /* if there is underlaying dt_device then pop_ctxt is not needed */ + if (new_ctx->dt != NULL) + return; + + ASSERT_CTXT_MAGIC(saved->magic); + ASSERT_KERNEL_CTXT("popping non-kernel context!\n"); + + LASSERTF(cfs_fs_pwd(current->fs) == new_ctx->pwd, "%p != %p\n", + cfs_fs_pwd(current->fs), new_ctx->pwd); + LASSERTF(cfs_fs_mnt(current->fs) == new_ctx->pwdmnt, "%p != %p\n", + cfs_fs_mnt(current->fs), new_ctx->pwdmnt); + + set_fs(saved->fs); + ll_set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd); + + dput(saved->pwd); + mntput(saved->pwdmnt); + current->fs->umask = saved->luc.luc_umask; + if (uc) { + struct cred *cred; + if ((cred = prepare_creds())) { + cred->uid = saved->luc.luc_uid; + cred->gid = saved->luc.luc_gid; + cred->fsuid = saved->luc.luc_fsuid; + cred->fsgid = saved->luc.luc_fsgid; + cred->cap_effective = saved->luc.luc_cap; + commit_creds(cred); + } + + pop_group_info(saved, + uc->luc_ginfo ?: + uc->luc_identity ? uc->luc_identity->mi_ginfo : + NULL); + } +} +EXPORT_SYMBOL(pop_ctxt); + +/* utility to rename a file */ +int lustre_rename(struct dentry *dir, struct vfsmount *mnt, + char *oldname, char *newname) +{ + struct dentry *dchild_old, *dchild_new; + int err = 0; + ENTRY; + + ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n"); + CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", + (int)strlen(oldname), oldname, (int)strlen(newname), newname); + + dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname)); + if (IS_ERR(dchild_old)) + RETURN(PTR_ERR(dchild_old)); + + if (!dchild_old->d_inode) + GOTO(put_old, err = -ENOENT); + + dchild_new = ll_lookup_one_len(newname, dir, strlen(newname)); + if (IS_ERR(dchild_new)) + GOTO(put_old, err = PTR_ERR(dchild_new)); + + err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, + dir->d_inode, dchild_new, mnt); + + dput(dchild_new); +put_old: + dput(dchild_old); + RETURN(err); +} +EXPORT_SYMBOL(lustre_rename); + +/* Note: dput(dchild) will *not* be called if there is an error */ +struct l_file *l_dentry_open(struct lvfs_run_ctxt *ctxt, struct l_dentry *de, + int flags) +{ + struct path path = { + .dentry = de, + .mnt = ctxt->pwdmnt, + }; + return ll_dentry_open(&path, flags, current_cred()); +} +EXPORT_SYMBOL(l_dentry_open); + +#ifdef LPROCFS +__s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field) +{ + __s64 ret = 0; + + if (lc == NULL || header == NULL) + RETURN(0); + + switch (field) { + case LPROCFS_FIELDS_FLAGS_CONFIG: + ret = header->lc_config; + break; + case LPROCFS_FIELDS_FLAGS_SUM: + ret = lc->lc_sum; + if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + ret += lc->lc_sum_irq; + break; + case LPROCFS_FIELDS_FLAGS_MIN: + ret = lc->lc_min; + break; + case LPROCFS_FIELDS_FLAGS_MAX: + ret = lc->lc_max; + break; + case LPROCFS_FIELDS_FLAGS_AVG: + ret = (lc->lc_max - lc->lc_min) / 2; + break; + case LPROCFS_FIELDS_FLAGS_SUMSQUARE: + ret = lc->lc_sumsquare; + break; + case LPROCFS_FIELDS_FLAGS_COUNT: + ret = lc->lc_count; + break; + default: + break; + }; + + RETURN(ret); +} +EXPORT_SYMBOL(lprocfs_read_helper); +#endif /* LPROCFS */ + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre VFS Filesystem Helper v0.1"); +MODULE_LICENSE("GPL"); diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile new file mode 100644 index 000000000000..93bae242e761 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += mdc.o +mdc-y := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c new file mode 100644 index 000000000000..a6a8a0d3d009 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c @@ -0,0 +1,183 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#ifdef LPROCFS + +static int mdc_rd_max_rpcs_in_flight(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int mdc_wr_max_rpcs_in_flight(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > MDC_MAX_RIF_MAX) + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} + +/* temporary for testing */ +static int mdc_wr_kuc(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct kuc_hdr *lh; + struct hsm_action_list *hal; + struct hsm_action_item *hai; + int len; + int fd, rc; + ENTRY; + + rc = lprocfs_write_helper(buffer, count, &fd); + if (rc) + RETURN(rc); + + if (fd < 0) + RETURN(-ERANGE); + CWARN("message to fd %d\n", fd); + + len = sizeof(*lh) + sizeof(*hal) + MTI_NAME_MAXLEN + + /* for mockup below */ 2 * cfs_size_round(sizeof(*hai)); + + OBD_ALLOC(lh, len); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_HSM; + lh->kuc_msgtype = HMT_ACTION_LIST; + lh->kuc_msglen = len; + + hal = (struct hsm_action_list *)(lh + 1); + hal->hal_version = HAL_VERSION; + hal->hal_archive_id = 1; + hal->hal_flags = 0; + obd_uuid2fsname(hal->hal_fsname, obd->obd_name, MTI_NAME_MAXLEN); + + /* mock up an action list */ + hal->hal_count = 2; + hai = hai_zero(hal); + hai->hai_action = HSMA_ARCHIVE; + hai->hai_fid.f_oid = 5; + hai->hai_len = sizeof(*hai); + hai = hai_next(hai); + hai->hai_action = HSMA_RESTORE; + hai->hai_fid.f_oid = 10; + hai->hai_len = sizeof(*hai); + + /* This works for either broadcast or unicast to a single fd */ + if (fd == 0) { + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + } else { + struct file *fp = fget(fd); + + rc = libcfs_kkuc_msg_put(fp, lh); + fput(fp); + } + OBD_FREE(lh, len); + if (rc < 0) + RETURN(rc); + RETURN(count); +} + +static struct lprocfs_vars lprocfs_mdc_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "ping", 0, lprocfs_wr_ping, 0, 0, 0222 }, + { "connect_flags", lprocfs_rd_connect_flags, 0, 0 }, + { "blocksize", lprocfs_rd_blksize, 0, 0 }, + { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, + { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, + { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, + { "filestotal", lprocfs_rd_filestotal, 0, 0 }, + { "filesfree", lprocfs_rd_filesfree, 0, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/ + { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, + { "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, + /* + * FIXME: below proc entry is provided, but not in used, instead + * sbi->sb_md_brw_size is used, the per obd variable should be used + * when CMD is enabled, and dir pages are managed in MDC layer. + * Remember to enable proc write function. + */ + { "max_pages_per_rpc", lprocfs_obd_rd_max_pages_per_rpc, + /* lprocfs_obd_wr_max_pages_per_rpc */0, 0 }, + { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight, + mdc_wr_max_rpcs_in_flight, 0 }, + { "timeouts", lprocfs_rd_timeouts, 0, 0 }, + { "import", lprocfs_rd_import, lprocfs_wr_import, 0 }, + { "state", lprocfs_rd_state, 0, 0 }, + { "hsm_nl", 0, mdc_wr_kuc, 0, 0, 0200 }, + { "pinger_recov", lprocfs_rd_pinger_recov, + lprocfs_wr_pinger_recov, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_mdc_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mdc_module_vars; + lvars->obd_vars = lprocfs_mdc_obd_vars; +} +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h new file mode 100644 index 000000000000..2aeff0ecec34 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/mdc_internal.h @@ -0,0 +1,180 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _MDC_INTERNAL_H +#define _MDC_INTERNAL_H + +#include +#include + +#ifdef LPROCFS +void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, + struct obd_capa *oc, __u64 valid, int ea_size, + __u32 suppgid, int flags); +void mdc_pack_capa(struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa *oc); +int mdc_pack_req(struct ptlrpc_request *req, int version, int opc); +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags); +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data); +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, __u32 size, + const struct lu_fid *fid, struct obd_capa *oc); +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *data, int ea_size); +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len); +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, __u32 uid, + __u32 gid, cfs_cap_t capability, __u64 rdev); +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u32 flags, const void *data, + int datalen); +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen); +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); +int mdc_enter_request(struct client_obd *cli); +void mdc_exit_request(struct client_obd *cli); + +/* mdc/mdc_locks.c */ +int mdc_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits); + +int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); + +int mdc_find_cbdata(struct obd_export *exp, const struct lu_fid *fid, + ldlm_iterator_t it, void *data); + +int mdc_intent_lock(struct obd_export *exp, + struct md_op_data *, + void *lmm, int lmmsize, + struct lookup_intent *, int, + struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags); +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **req, __u64 extra_lock_flags); + +int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits); +/* mdc/mdc_request.c */ +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + +int mdc_open(struct obd_export *exp, obd_id ino, int type, int flags, + struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, + struct ptlrpc_request **); + +struct obd_client_handle; + +int mdc_get_lustre_md(struct obd_export *md_exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *lmv_exp, + struct lustre_md *md); + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md); + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct ptlrpc_request *open_req); + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och); +void mdc_commit_open(struct ptlrpc_request *req); +void mdc_replay_open(struct ptlrpc_request *req); + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t capability, __u64 rdev, + struct ptlrpc_request **request); +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request); +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod); +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request); +int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + ldlm_cancel_flags_t flags, void *opaque); + +static inline void mdc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits); + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo); + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh); + +static inline int mdc_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, int opc, + struct list_head *cancels, int count) +{ + return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, + count); +} + +#endif diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c new file mode 100644 index 000000000000..05c6968119c8 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/mdc_lib.c @@ -0,0 +1,564 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC +#include +#include +#include "mdc_internal.h" + + +static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) +{ + LASSERT (b != NULL); + + b->suppgid = suppgid; + b->uid = current_uid(); + b->gid = current_gid(); + b->fsuid = current_fsuid(); + b->fsgid = current_fsgid(); + b->capability = cfs_curproc_cap_pack(); +} + +void mdc_pack_capa(struct ptlrpc_request *req, const struct req_msg_field *field, + struct obd_capa *oc) +{ + struct req_capsule *pill = &req->rq_pill; + struct lustre_capa *c; + + if (oc == NULL) { + LASSERT(req_capsule_get_size(pill, field, RCL_CLIENT) == 0); + return; + } + + c = req_capsule_client_get(pill, field); + LASSERT(c != NULL); + capa_cpy(c, oc); + DEBUG_CAPA(D_SEC, c, "pack"); +} + +void mdc_is_subdir_pack(struct ptlrpc_request *req, const struct lu_fid *pfid, + const struct lu_fid *cfid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + if (pfid) { + b->fid1 = *pfid; + b->valid = OBD_MD_FLID; + } + if (cfid) + b->fid2 = *cfid; + b->flags = flags; +} + +void mdc_swap_layouts_pack(struct ptlrpc_request *req, + struct md_op_data *op_data) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + __mdc_pack_body(b, op_data->op_suppgids[0]); + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); +} + +void mdc_pack_body(struct ptlrpc_request *req, + const struct lu_fid *fid, struct obd_capa *oc, + __u64 valid, int ea_size, __u32 suppgid, int flags) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + LASSERT(b != NULL); + b->valid = valid; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, suppgid); + if (fid) { + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + mdc_pack_capa(req, &RMF_CAPA1, oc); + } +} + +void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, + __u32 size, const struct lu_fid *fid, struct obd_capa *oc) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + b->fid1 = *fid; + b->valid |= OBD_MD_FLID; + b->size = pgoff; /* !! */ + b->nlink = size; /* !! */ + __mdc_pack_body(b, -1); + b->mode = LUDA_FID | LUDA_TYPE; + + mdc_pack_capa(req, &RMF_CAPA1, oc); +} + +/* packing of MDS records */ +void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const void *data, int datalen, __u32 mode, + __u32 uid, __u32 gid, cfs_cap_t cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; + if (op_data->op_bias & MDS_CREATE_VOLATILE) + flags |= MDS_OPEN_VOLATILE; + set_mrc_cr_flags(rec, flags); + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + if (data) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, data, datalen); + } +} + +static __u64 mds_pack_open_flags(__u32 flags, __u32 mode) +{ + __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | + MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | + MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | + MDS_OPEN_BY_FID)); + if (flags & O_CREAT) + cr_flags |= MDS_OPEN_CREAT; + if (flags & O_EXCL) + cr_flags |= MDS_OPEN_EXCL; + if (flags & O_TRUNC) + cr_flags |= MDS_OPEN_TRUNC; + if (flags & O_APPEND) + cr_flags |= MDS_OPEN_APPEND; + if (flags & O_SYNC) + cr_flags |= MDS_OPEN_SYNC; + if (flags & O_DIRECTORY) + cr_flags |= MDS_OPEN_DIRECTORY; +#ifdef FMODE_EXEC + if (flags & FMODE_EXEC) + cr_flags |= MDS_FMODE_EXEC; +#endif + if (flags & O_LOV_DELAY_CREATE) + cr_flags |= MDS_OPEN_DELAY_CREATE; + + if ((flags & O_NOACCESS) || (flags & O_NONBLOCK)) + cr_flags |= MDS_OPEN_NORESTORE; + + return cr_flags; +} + +/* packing of MDS records */ +void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + __u32 mode, __u64 rdev, __u32 flags, const void *lmm, + int lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + __u64 cr_flags; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_create)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = current_fsuid(); + rec->cr_fsgid = current_fsgid(); + rec->cr_cap = cfs_curproc_cap_pack(); + if (op_data != NULL) { + rec->cr_fid1 = op_data->op_fid1; + rec->cr_fid2 = op_data->op_fid2; + } + rec->cr_mode = mode; + cr_flags = mds_pack_open_flags(flags, mode); + rec->cr_rdev = rdev; + rec->cr_time = op_data->op_mod_time; + rec->cr_suppgid1 = op_data->op_suppgids[0]; + rec->cr_suppgid2 = op_data->op_suppgids[1]; + rec->cr_bias = op_data->op_bias; + rec->cr_umask = current_umask(); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + /* the next buffer is child capa, which is used for replay, + * will be packed from the data in reply message. */ + + if (op_data->op_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + if (op_data->op_bias & MDS_CREATE_VOLATILE) + cr_flags |= MDS_OPEN_VOLATILE; + } + + if (lmm) { + cr_flags |= MDS_OPEN_HAS_EA; + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, lmm, lmmlen); + } + set_mrc_cr_flags(rec, cr_flags); +} + +static inline __u64 attr_pack(unsigned int ia_valid) { + __u64 sa_valid = 0; + + if (ia_valid & ATTR_MODE) + sa_valid |= MDS_ATTR_MODE; + if (ia_valid & ATTR_UID) + sa_valid |= MDS_ATTR_UID; + if (ia_valid & ATTR_GID) + sa_valid |= MDS_ATTR_GID; + if (ia_valid & ATTR_SIZE) + sa_valid |= MDS_ATTR_SIZE; + if (ia_valid & ATTR_ATIME) + sa_valid |= MDS_ATTR_ATIME; + if (ia_valid & ATTR_MTIME) + sa_valid |= MDS_ATTR_MTIME; + if (ia_valid & ATTR_CTIME) + sa_valid |= MDS_ATTR_CTIME; + if (ia_valid & ATTR_ATIME_SET) + sa_valid |= MDS_ATTR_ATIME_SET; + if (ia_valid & ATTR_MTIME_SET) + sa_valid |= MDS_ATTR_MTIME_SET; + if (ia_valid & ATTR_FORCE) + sa_valid |= MDS_ATTR_FORCE; + if (ia_valid & ATTR_ATTR_FLAG) + sa_valid |= MDS_ATTR_ATTR_FLAG; + if (ia_valid & ATTR_KILL_SUID) + sa_valid |= MDS_ATTR_KILL_SUID; + if (ia_valid & ATTR_KILL_SGID) + sa_valid |= MDS_ATTR_KILL_SGID; + if (ia_valid & ATTR_CTIME_SET) + sa_valid |= MDS_ATTR_CTIME_SET; + if (ia_valid & ATTR_FROM_OPEN) + sa_valid |= MDS_ATTR_FROM_OPEN; + if (ia_valid & ATTR_BLOCKS) + sa_valid |= MDS_ATTR_BLOCKS; + if (ia_valid & MDS_OPEN_OWNEROVERRIDE) + /* NFSD hack (see bug 5781) */ + sa_valid |= MDS_OPEN_OWNEROVERRIDE; + return sa_valid; +} + +static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, + struct md_op_data *op_data) +{ + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = current_fsuid(); + rec->sa_fsgid = current_fsgid(); + rec->sa_cap = cfs_curproc_cap_pack(); + rec->sa_suppgid = -1; + + rec->sa_fid = op_data->op_fid1; + rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); + rec->sa_mode = op_data->op_attr.ia_mode; + rec->sa_uid = op_data->op_attr.ia_uid; + rec->sa_gid = op_data->op_attr.ia_gid; + rec->sa_size = op_data->op_attr.ia_size; + rec->sa_blocks = op_data->op_attr_blocks; + rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); + rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); + rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); + rec->sa_attr_flags = ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + if ((op_data->op_attr.ia_valid & ATTR_GID) && + current_is_in_group(op_data->op_attr.ia_gid)) + rec->sa_suppgid = op_data->op_attr.ia_gid; + else + rec->sa_suppgid = op_data->op_suppgids[0]; + + rec->sa_bias = op_data->op_bias; +} + +static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, + struct md_op_data *op_data) +{ + memcpy(&epoch->handle, &op_data->op_handle, sizeof(epoch->handle)); + epoch->ioepoch = op_data->op_ioepoch; + epoch->flags = op_data->op_flags & MF_SOM_LOCAL_FLAGS; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len) +{ + struct mdt_rec_setattr *rec; + struct mdt_ioepoch *epoch; + struct lov_user_md *lum = NULL; + + CLASSERT(sizeof(struct mdt_rec_reint) ==sizeof(struct mdt_rec_setattr)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + mdc_setattr_pack_rec(rec, op_data); + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) { + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + mdc_ioepoch_pack(epoch, op_data); + } + + if (ealen == 0) + return; + + lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + if (ea == NULL) { /* Remove LOV EA */ + lum->lmm_magic = LOV_USER_MAGIC_V1; + lum->lmm_stripe_size = 0; + lum->lmm_stripe_count = 0; + lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); + } else { + memcpy(lum, ea, ealen); + } + + if (ea2len == 0) + return; + + memcpy(req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES), ea2, + ea2len); +} + +void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_unlink *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_unlink)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT(rec != NULL); + + rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? + REINT_RMENTRY : REINT_UNLINK; + rec->ul_fsuid = op_data->op_fsuid; + rec->ul_fsgid = op_data->op_fsgid; + rec->ul_cap = op_data->op_cap; + rec->ul_mode = op_data->op_mode; + rec->ul_suppgid1= op_data->op_suppgids[0]; + rec->ul_suppgid2= -1; + rec->ul_fid1 = op_data->op_fid1; + rec->ul_fid2 = op_data->op_fid2; + rec->ul_time = op_data->op_mod_time; + rec->ul_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(tmp != NULL); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_rec_link *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_link)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + LASSERT (rec != NULL); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = op_data->op_fsuid;//current->fsuid; + rec->lk_fsgid = op_data->op_fsgid;//current->fsgid; + rec->lk_cap = op_data->op_cap;//current->cap_effective; + rec->lk_suppgid1 = op_data->op_suppgids[0]; + rec->lk_suppgid2 = op_data->op_suppgids[1]; + rec->lk_fid1 = op_data->op_fid1; + rec->lk_fid2 = op_data->op_fid2; + rec->lk_time = op_data->op_mod_time; + rec->lk_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); +} + +void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen) +{ + struct mdt_rec_rename *rec; + char *tmp; + + CLASSERT(sizeof(struct mdt_rec_reint) == sizeof(struct mdt_rec_rename)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = op_data->op_fsuid; + rec->rn_fsgid = op_data->op_fsgid; + rec->rn_cap = op_data->op_cap; + rec->rn_suppgid1 = op_data->op_suppgids[0]; + rec->rn_suppgid2 = op_data->op_suppgids[1]; + rec->rn_fid1 = op_data->op_fid1; + rec->rn_fid2 = op_data->op_fid2; + rec->rn_time = op_data->op_mod_time; + rec->rn_mode = op_data->op_mode; + rec->rn_bias = op_data->op_bias; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_pack_capa(req, &RMF_CAPA2, op_data->op_capa2); + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(old, oldlen, tmp); + + if (new) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SYMTGT); + LOGL0(new, newlen, tmp); + } +} + +void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, int flags, + struct md_op_data *op_data, int ea_size) +{ + struct mdt_body *b = req_capsule_client_get(&req->rq_pill, + &RMF_MDT_BODY); + + b->valid = valid; + if (op_data->op_bias & MDS_CHECK_SPLIT) + b->valid |= OBD_MD_FLCKSPLIT; + if (op_data->op_bias & MDS_CROSS_REF) + b->valid |= OBD_MD_FLCROSSREF; + b->eadatasize = ea_size; + b->flags = flags; + __mdc_pack_body(b, op_data->op_suppgids[0]); + + b->fid1 = op_data->op_fid1; + b->fid2 = op_data->op_fid2; + b->valid |= OBD_MD_FLID; + + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + + if (op_data->op_name) { + char *tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LOGL0(op_data->op_name, op_data->op_namelen, tmp); + + } +} + +void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) +{ + struct mdt_ioepoch *epoch; + struct mdt_rec_setattr *rec; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + + mdc_setattr_pack_rec(rec, op_data); + mdc_pack_capa(req, &RMF_CAPA1, op_data->op_capa1); + mdc_ioepoch_pack(epoch, op_data); +} + +static int mdc_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) +{ + int rc; + ENTRY; + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&mcw->mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + RETURN(rc); +}; + +/* We record requests in flight in cli->cl_r_in_flight here. + * There is only one write rpc possible in mdc anyway. If this to change + * in the future - the code may need to be revisited. */ +int mdc_enter_request(struct client_obd *cli) +{ + int rc = 0; + struct mdc_cache_waiter mcw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + list_add_tail(&mcw.mcw_entry, &cli->cl_cache_waiters); + init_waitqueue_head(&mcw.mcw_waitq); + client_obd_list_unlock(&cli->cl_loi_list_lock); + rc = l_wait_event(mcw.mcw_waitq, mdc_req_avail(cli, &mcw), &lwi); + if (rc) { + client_obd_list_lock(&cli->cl_loi_list_lock); + if (list_empty(&mcw.mcw_entry)) + cli->cl_r_in_flight--; + list_del_init(&mcw.mcw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + } else { + cli->cl_r_in_flight++; + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + return rc; +} + +void mdc_exit_request(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct mdc_cache_waiter *mcw; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_r_in_flight--; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { + /* No free request slots anymore */ + break; + } + + mcw = list_entry(l, struct mdc_cache_waiter, mcw_entry); + list_del_init(&mcw->mcw_entry); + cli->cl_r_in_flight++; + wake_up(&mcw->mcw_waitq); + } + /* Empty waiting list? Decrease reqs in-flight number */ + + client_obd_list_unlock(&cli->cl_loi_list_lock); +} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c new file mode 100644 index 000000000000..1cc90b635fb5 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/mdc_locks.c @@ -0,0 +1,1229 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include +# include +# include + +#include +#include +#include +/* fid_res_name_eq() */ +#include +#include +#include "mdc_internal.h" + +struct mdc_getattr_args { + struct obd_export *ga_exp; + struct md_enqueue_info *ga_minfo; + struct ldlm_enqueue_info *ga_einfo; +}; + +int it_disposition(struct lookup_intent *it, int flag) +{ + return it->d.lustre.it_disposition & flag; +} +EXPORT_SYMBOL(it_disposition); + +void it_set_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition |= flag; +} +EXPORT_SYMBOL(it_set_disposition); + +void it_clear_disposition(struct lookup_intent *it, int flag) +{ + it->d.lustre.it_disposition &= ~flag; +} +EXPORT_SYMBOL(it_clear_disposition); + +int it_open_error(int phase, struct lookup_intent *it) +{ + if (it_disposition(it, DISP_OPEN_OPEN)) { + if (phase >= DISP_OPEN_OPEN) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_OPEN_CREATE)) { + if (phase >= DISP_OPEN_CREATE) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_LOOKUP_EXECD)) { + if (phase >= DISP_LOOKUP_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + + if (it_disposition(it, DISP_IT_EXECD)) { + if (phase >= DISP_IT_EXECD) + return it->d.lustre.it_status; + else + return 0; + } + CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, + it->d.lustre.it_status); + LBUG(); + return 0; +} +EXPORT_SYMBOL(it_open_error); + +/* this must be called on a lockh that is known to have a referenced lock */ +int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data, + __u64 *bits) +{ + struct ldlm_lock *lock; + struct inode *new_inode = data; + ENTRY; + + if(bits) + *bits = 0; + + if (!*lockh) + RETURN(0); + + lock = ldlm_handle2lock((struct lustre_handle *)lockh); + + LASSERT(lock != NULL); + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode && + lock->l_resource->lr_lvb_inode != data) { + struct inode *old_inode = lock->l_resource->lr_lvb_inode; + LASSERTF(old_inode->i_state & I_FREEING, + "Found existing inode %p/%lu/%u state %lu in lock: " + "setting data to %p/%lu/%u\n", old_inode, + old_inode->i_ino, old_inode->i_generation, + old_inode->i_state, + new_inode, new_inode->i_ino, new_inode->i_generation); + } + lock->l_resource->lr_lvb_inode = new_inode; + if (bits) + *bits = lock->l_policy_data.l_inodebits.bits; + + unlock_res_and_lock(lock); + LDLM_LOCK_PUT(lock); + + RETURN(0); +} + +ldlm_mode_t mdc_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, ldlm_type_t type, + ldlm_policy_data_t *policy, ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id; + ldlm_mode_t rc; + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, + &res_id, type, policy, mode, lockh, 0); + RETURN(rc); +} + +int mdc_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); + int rc; + + ENTRY; + + fid_build_reg_res_name(fid, &res_id); + rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, + policy, mode, flags, opaque); + RETURN(rc); +} + +int mdc_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + struct ldlm_res_id res_id; + struct ldlm_resource *res; + struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; + ENTRY; + + LASSERTF(ns != NULL, "no namespace passed\n"); + + fid_build_reg_res_name(fid, &res_id); + + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if(res == NULL) + RETURN(0); + + lock_res(res); + res->lr_lvb_inode = NULL; + unlock_res(res); + + ldlm_resource_putref(res); + RETURN(0); +} + +/* find any ldlm lock of the inode in mdc + * return 0 not find + * 1 find one + * < 0 error */ +int mdc_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + struct ldlm_res_id res_id; + int rc = 0; + ENTRY; + + fid_build_reg_res_name((struct lu_fid*)fid, &res_id); + rc = ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id, + it, data); + if (rc == LDLM_ITER_STOP) + RETURN(1); + else if (rc == LDLM_ITER_CONTINUE) + RETURN(0); + RETURN(rc); +} + +static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) +{ + /* Don't hold error requests for replay. */ + if (req->rq_replay) { + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + } + if (rc && req->rq_transno != 0) { + DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); + LBUG(); + } +} + +/* Save a large LOV EA into the request buffer so that it is available + * for replay. We don't do this in the initial request because the + * original request doesn't need this buffer (at most it sends just the + * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty + * buffer and may also be difficult to allocate and save a very large + * request buffer for each open. (bug 5707) + * + * OOM here may cause recovery failure if lmm is needed (only for the + * original open if the MDS crashed just when this client also OOM'd) + * but this is incredibly unlikely, and questionable whether the client + * could do MDS recovery under OOM anyways... */ +static void mdc_realloc_openmsg(struct ptlrpc_request *req, + struct mdt_body *body) +{ + int rc; + + /* FIXME: remove this explicit offset. */ + rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4, + body->eadatasize); + if (rc) { + CERROR("Can't enlarge segment %d size to %d\n", + DLM_INTENT_REC_OFF + 4, body->eadatasize); + body->valid &= ~OBD_MD_FLEASIZE; + body->eadatasize = 0; + } +} + +static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data, + void *lmm, int lmmsize, + void *cb_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + LIST_HEAD(cancels); + int count = 0; + int mode; + int rc; + ENTRY; + + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + + /* XXX: openlock is not cancelled for cross-refs. */ + /* If inode is known, cancel conflicting OPEN locks. */ + if (fid_is_sane(&op_data->op_fid2)) { + if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) + mode = LCK_CW; +#ifdef FMODE_EXEC + else if (it->it_flags & FMODE_EXEC) + mode = LCK_PR; +#endif + else + mode = LCK_CR; + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, mode, + MDS_INODELOCK_OPEN); + } + + /* If CREATE, cancel parent's UPDATE lock. */ + if (it->it_op & IT_CREAT) + mode = LCK_EX; + else + mode = LCK_CR; + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, mode, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_OPEN); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(ERR_PTR(-ENOMEM)); + } + + /* parent capability */ + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + /* child capability, reserve the size according to parent capa, it will + * be filled after we get the reply */ + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa1); + + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + max(lmmsize, obddev->u.cli.cl_default_mds_easize)); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + return NULL; + } + + spin_lock(&req->rq_lock); + req->rq_replay = req->rq_import->imp_replayable; + spin_unlock(&req->rq_lock); + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, + lmmsize); + + /* for remote client, fetch remote perm for current user */ + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + return req; +} + +static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_intent *lit; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_UNLINK); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + obddev->u.cli.cl_max_mds_cookiesize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *op_data) +{ + struct ptlrpc_request *req; + struct obd_device *obddev = class_exp2obd(exp); + obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | + OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | + OBD_MD_FLMDSCAPA | OBD_MD_MEA | + (client_is_remote(exp) ? + OBD_MD_FLRMTPERM : OBD_MD_FLACL); + struct ldlm_intent *lit; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_GETATTR); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the intended request */ + mdc_getattr_pack(req, valid, it->it_flags, op_data, + obddev->u.cli.cl_max_mds_easize); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obddev->u.cli.cl_max_mds_easize); + if (client_is_remote(exp)) + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, + struct lookup_intent *it, + struct md_op_data *unused) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct ldlm_intent *lit; + struct layout_intent *layout; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_INTENT_LAYOUT); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + /* pack the intent */ + lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); + lit->opc = (__u64)it->it_op; + + /* pack the layout intent request */ + layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); + /* LAYOUT_INTENT_ACCESS is generic, specific operation will be + * set for replication */ + layout->li_opc = LAYOUT_INTENT_ACCESS; + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static struct ptlrpc_request * +mdc_enqueue_pack(struct obd_export *exp, int lvb_len) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); + if (req == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(ERR_PTR(rc)); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); + ptlrpc_request_set_replen(req); + RETURN(req); +} + +static int mdc_finish_enqueue(struct obd_export *exp, + struct ptlrpc_request *req, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct lustre_handle *lockh, + int rc) +{ + struct req_capsule *pill = &req->rq_pill; + struct ldlm_request *lockreq; + struct ldlm_reply *lockrep; + struct lustre_intent_data *intent = &it->d.lustre; + struct ldlm_lock *lock; + void *lvb_data = NULL; + int lvb_len = 0; + ENTRY; + + LASSERT(rc >= 0); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || req->rq_replay) { + lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); + lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); + } + + if (rc == ELDLM_LOCK_ABORTED) { + einfo->ei_mode = 0; + memset(lockh, 0, sizeof(*lockh)); + rc = 0; + } else { /* rc = 0 */ + lock = ldlm_handle2lock(lockh); + LASSERT(lock != NULL); + + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != einfo->ei_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, einfo->ei_mode); + einfo->ei_mode = lock->l_req_mode; + } + LDLM_LOCK_PUT(lock); + } + + lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + + intent->it_disposition = (int)lockrep->lock_policy_res1; + intent->it_status = (int)lockrep->lock_policy_res2; + intent->it_lock_mode = einfo->ei_mode; + intent->it_lock_handle = lockh->cookie; + intent->it_data = req; + + /* Technically speaking rq_transno must already be zero if + * it_status is in error, so the check is a bit redundant */ + if ((!req->rq_transno || intent->it_status < 0) && req->rq_replay) + mdc_clear_replay_flag(req, intent->it_status); + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. + * + * It's important that we do this first! Otherwise we might exit the + * function without doing so, and try to replay a failed create + * (bug 3440) */ + if (it->it_op & IT_OPEN && req->rq_replay && + (!it_disposition(it, DISP_OPEN_OPEN) ||intent->it_status != 0)) + mdc_clear_replay_flag(req, intent->it_status); + + DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", + it->it_op, intent->it_disposition, intent->it_status); + + /* We know what to expect, so we do any byte flipping required here */ + if (it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR)) { + struct mdt_body *body; + + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) { + CERROR ("Can't swab mdt_body\n"); + RETURN (-EPROTO); + } + + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + /* + * If this is a successful OPEN request, we need to set + * replay handler and data early, so that if replay + * happens immediately after swabbing below, new reply + * is swabbed by that handler correctly. + */ + mdc_set_open_replay_data(NULL, NULL, req); + } + + if ((body->valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { + void *eadata; + + mdc_update_max_ea_from_body(exp, body); + + /* + * The eadata is opaque; just check that it is there. + * Eventually, obd_unpackmd() will check the contents. + */ + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + + /* save lvb data and length in case this is for layout + * lock */ + lvb_data = eadata; + lvb_len = body->eadatasize; + + /* + * We save the reply LOV EA in case we have to replay a + * create for recovery. If we didn't allocate a large + * enough request buffer above we need to reallocate it + * here to hold the actual LOV EA. + * + * To not save LOV EA if request is not going to replay + * (for example error one). + */ + if ((it->it_op & IT_OPEN) && req->rq_replay) { + void *lmm; + if (req_capsule_get_size(pill, &RMF_EADATA, + RCL_CLIENT) < + body->eadatasize) + mdc_realloc_openmsg(req, body); + else + req_capsule_shrink(pill, &RMF_EADATA, + body->eadatasize, + RCL_CLIENT); + + req_capsule_set_size(pill, &RMF_EADATA, + RCL_CLIENT, + body->eadatasize); + + lmm = req_capsule_client_get(pill, &RMF_EADATA); + if (lmm) + memcpy(lmm, eadata, body->eadatasize); + } + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + RETURN(-EPROTO); + } + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa, *p; + + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + RETURN(-EPROTO); + + if (it->it_op & IT_OPEN) { + /* client fid capa will be checked in replay */ + p = req_capsule_client_get(pill, &RMF_CAPA2); + LASSERT(p); + *p = *capa; + } + } + if (body->valid & OBD_MD_FLOSSCAPA) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(pill, &RMF_CAPA2); + if (capa == NULL) + RETURN(-EPROTO); + } + } else if (it->it_op & IT_LAYOUT) { + /* maybe the lock was granted right away and layout + * is packed into RMF_DLM_LVB of req */ + lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); + if (lvb_len > 0) { + lvb_data = req_capsule_server_sized_get(pill, + &RMF_DLM_LVB, lvb_len); + if (lvb_data == NULL) + RETURN(-EPROTO); + } + } + + /* fill in stripe data for layout lock */ + lock = ldlm_handle2lock(lockh); + if (lock != NULL && ldlm_has_layout(lock) && lvb_data != NULL) { + void *lmm; + + LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d\n", + ldlm_it2str(it->it_op), lvb_len); + + OBD_ALLOC_LARGE(lmm, lvb_len); + if (lmm == NULL) { + LDLM_LOCK_PUT(lock); + RETURN(-ENOMEM); + } + memcpy(lmm, lvb_data, lvb_len); + + /* install lvb_data */ + lock_res_and_lock(lock); + if (lock->l_lvb_data == NULL) { + lock->l_lvb_data = lmm; + lock->l_lvb_len = lvb_len; + lmm = NULL; + } + unlock_res_and_lock(lock); + if (lmm != NULL) + OBD_FREE_LARGE(lmm, lvb_len); + } + if (lock != NULL) + LDLM_LOCK_PUT(lock); + + RETURN(rc); +} + +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. */ +int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, struct md_op_data *op_data, + struct lustre_handle *lockh, void *lmm, int lmmsize, + struct ptlrpc_request **reqp, __u64 extra_lock_flags) +{ + struct obd_device *obddev = class_exp2obd(exp); + struct ptlrpc_request *req = NULL; + __u64 flags, saved_flags = extra_lock_flags; + int rc; + struct ldlm_res_id res_id; + static const ldlm_policy_data_t lookup_policy = + { .l_inodebits = { MDS_INODELOCK_LOOKUP } }; + static const ldlm_policy_data_t update_policy = + { .l_inodebits = { MDS_INODELOCK_UPDATE } }; + static const ldlm_policy_data_t layout_policy = + { .l_inodebits = { MDS_INODELOCK_LAYOUT } }; + ldlm_policy_data_t const *policy = &lookup_policy; + int generation, resends = 0; + struct ldlm_reply *lockrep; + enum lvb_type lvb_type = 0; + ENTRY; + + LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", + einfo->ei_type); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + + if (it) { + saved_flags |= LDLM_FL_HAS_INTENT; + if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) + policy = &update_policy; + else if (it->it_op & IT_LAYOUT) + policy = &layout_policy; + } + + LASSERT(reqp == NULL); + + generation = obddev->u.cli.cl_import->imp_generation; +resend: + flags = saved_flags; + if (!it) { + /* The only way right now is FLOCK, in this case we hide flock + policy as lmm, but lmmsize is 0 */ + LASSERT(lmm && lmmsize == 0); + LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", + einfo->ei_type); + policy = (ldlm_policy_data_t *)lmm; + res_id.name[3] = LDLM_FLOCK; + } else if (it->it_op & IT_OPEN) { + req = mdc_intent_open_pack(exp, it, op_data, lmm, lmmsize, + einfo->ei_cbdata); + policy = &update_policy; + einfo->ei_cbdata = NULL; + lmm = NULL; + } else if (it->it_op & IT_UNLINK) { + req = mdc_intent_unlink_pack(exp, it, op_data); + } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { + req = mdc_intent_getattr_pack(exp, it, op_data); + } else if (it->it_op & IT_READDIR) { + req = mdc_enqueue_pack(exp, 0); + } else if (it->it_op & IT_LAYOUT) { + if (!imp_connect_lvb_type(class_exp2cliimp(exp))) + RETURN(-EOPNOTSUPP); + + req = mdc_intent_layout_pack(exp, it, op_data); + lvb_type = LVB_T_LAYOUT; + } else { + LBUG(); + RETURN(-EINVAL); + } + + if (IS_ERR(req)) + RETURN(PTR_ERR(req)); + + if (req != NULL && it && it->it_op & IT_CREAT) + /* ask ptlrpc not to resend on EINPROGRESS since we have our own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = cfs_time_current_sec() + resends; + } + + /* It is important to obtain rpc_lock first (if applicable), so that + * threads that are serialised with rpc_lock are not polluting our + * rpcs in flight counter. We do not do flock request limiting, though*/ + if (it) { + mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + mdc_clear_replay_flag(req, 0); + ptlrpc_req_finished(req); + RETURN(rc); + } + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, + 0, lvb_type, lockh, 0); + if (!it) { + /* For flock requests we immediatelly return without further + delay and let caller deal with the rest, since rest of + this function metadata processing makes no sense for flock + requests anyway */ + RETURN(rc); + } + + mdc_exit_request(&obddev->u.cli); + mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); + + if (rc < 0) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + + lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); + LASSERT(lockrep != NULL); + + /* Retry the create infinitely when we get -EINPROGRESS from + * server. This is required by the new quota design. */ + if (it && it->it_op & IT_CREAT && + (int)lockrep->lock_policy_res2 == -EINPROGRESS) { + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d op:%d "DFID"/"DFID"\n", + obddev->obd_name, resends, it->it_op, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == obddev->u.cli.cl_import->imp_generation) { + goto resend; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc < 0) { + if (lustre_handle_is_used(lockh)) { + ldlm_lock_decref(lockh, einfo->ei_mode); + memset(lockh, 0, sizeof(*lockh)); + } + ptlrpc_req_finished(req); + } + RETURN(rc); +} + +static int mdc_finish_intent_lock(struct obd_export *exp, + struct ptlrpc_request *request, + struct md_op_data *op_data, + struct lookup_intent *it, + struct lustre_handle *lockh) +{ + struct lustre_handle old_lock; + struct mdt_body *mdt_body; + struct ldlm_lock *lock; + int rc; + + + LASSERT(request != NULL); + LASSERT(request != LP_POISON); + LASSERT(request->rq_repmsg != LP_POISON); + + if (!it_disposition(it, DISP_IT_EXECD)) { + /* The server failed before it even started executing the + * intent, i.e. because it couldn't unpack the request. */ + LASSERT(it->d.lustre.it_status != 0); + RETURN(it->d.lustre.it_status); + } + rc = it_open_error(DISP_IT_EXECD, it); + if (rc) + RETURN(rc); + + mdt_body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY); + LASSERT(mdt_body != NULL); /* mdc_enqueue checked */ + + /* If we were revalidating a fid/name pair, mark the intent in + * case we fail and get called again from lookup */ + if (fid_is_sane(&op_data->op_fid2) && + it->it_create_mode & M_CHECK_STALE && + it->it_op != IT_GETATTR) { + it_set_disposition(it, DISP_ENQ_COMPLETE); + + /* Also: did we find the same inode? */ + /* sever can return one of two fids: + * op_fid2 - new allocated fid - if file is created. + * op_fid3 - existent fid - if file only open. + * op_fid3 is saved in lmv_intent_open */ + if ((!lu_fid_eq(&op_data->op_fid2, &mdt_body->fid1)) && + (!lu_fid_eq(&op_data->op_fid3, &mdt_body->fid1))) { + CDEBUG(D_DENTRY, "Found stale data "DFID"("DFID")/"DFID + "\n", PFID(&op_data->op_fid2), + PFID(&op_data->op_fid2), PFID(&mdt_body->fid1)); + RETURN(-ESTALE); + } + } + + rc = it_open_error(DISP_LOOKUP_EXECD, it); + if (rc) + RETURN(rc); + + /* keep requests around for the multiple phases of the call + * this shows the DISP_XX must guarantee we make it into the call + */ + if (!it_disposition(it, DISP_ENQ_CREATE_REF) && + it_disposition(it, DISP_OPEN_CREATE) && + !it_open_error(DISP_OPEN_CREATE, it)) { + it_set_disposition(it, DISP_ENQ_CREATE_REF); + ptlrpc_request_addref(request); /* balanced in ll_create_node */ + } + if (!it_disposition(it, DISP_ENQ_OPEN_REF) && + it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) { + it_set_disposition(it, DISP_ENQ_OPEN_REF); + ptlrpc_request_addref(request); /* balanced in ll_file_open */ + /* BUG 11546 - eviction in the middle of open rpc processing */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout); + } + + if (it->it_op & IT_CREAT) { + /* XXX this belongs in ll_create_it */ + } else if (it->it_op == IT_OPEN) { + LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); + } else { + LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP | IT_LAYOUT)); + } + + /* If we already have a matching lock, then cancel the new + * one. We have to set the data here instead of in + * mdc_enqueue, because we need to use the child's inode as + * the l_ast_data to match, and that's not available until + * intent_finish has performed the iget().) */ + lock = ldlm_handle2lock(lockh); + if (lock) { + ldlm_policy_data_t policy = lock->l_policy_data; + LDLM_DEBUG(lock, "matching against this"); + + LASSERTF(fid_res_name_eq(&mdt_body->fid1, + &lock->l_resource->lr_name), + "Lock res_id: %lu/%lu/%lu, fid: %lu/%lu/%lu.\n", + (unsigned long)lock->l_resource->lr_name.name[0], + (unsigned long)lock->l_resource->lr_name.name[1], + (unsigned long)lock->l_resource->lr_name.name[2], + (unsigned long)fid_seq(&mdt_body->fid1), + (unsigned long)fid_oid(&mdt_body->fid1), + (unsigned long)fid_ver(&mdt_body->fid1)); + LDLM_LOCK_PUT(lock); + + memcpy(&old_lock, lockh, sizeof(*lockh)); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) { + ldlm_lock_decref_and_cancel(lockh, + it->d.lustre.it_lock_mode); + memcpy(lockh, &old_lock, sizeof(old_lock)); + it->d.lustre.it_lock_handle = lockh->cookie; + } + } + CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", + op_data->op_namelen, op_data->op_name, ldlm_it2str(it->it_op), + it->d.lustre.it_status, it->d.lustre.it_disposition, rc); + RETURN(rc); +} + +int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + ldlm_policy_data_t policy; + ldlm_mode_t mode; + ENTRY; + + if (it->d.lustre.it_lock_handle) { + lockh.cookie = it->d.lustre.it_lock_handle; + mode = ldlm_revalidate_lock_handle(&lockh, bits); + } else { + fid_build_reg_res_name(fid, &res_id); + switch (it->it_op) { + case IT_GETATTR: + policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; + break; + case IT_LAYOUT: + policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; + break; + default: + policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; + break; + } + mode = ldlm_lock_match(exp->exp_obd->obd_namespace, + LDLM_FL_BLOCK_GRANTED, &res_id, + LDLM_IBITS, &policy, + LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0); + } + + if (mode) { + it->d.lustre.it_lock_handle = lockh.cookie; + it->d.lustre.it_lock_mode = mode; + } else { + it->d.lustre.it_lock_handle = 0; + it->d.lustre.it_lock_mode = 0; + } + + RETURN(!!mode); +} + +/* + * This long block is all about fixing up the lock and request state + * so that it is correct as of the moment _before_ the operation was + * applied; that way, the VFS will think that everything is normal and + * call Lustre's regular VFS methods. + * + * If we're performing a creation, that means that unless the creation + * failed with EEXIST, we should fake up a negative dentry. + * + * For everything else, we want to lookup to succeed. + * + * One additional note: if CREATE or OPEN succeeded, we add an extra + * reference to the request because we need to keep it around until + * ll_create/ll_open gets called. + * + * The server will return to us, in it_disposition, an indication of + * exactly what d.lustre.it_status refers to. + * + * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call, + * otherwise if DISP_OPEN_CREATE is set, then it status is the + * creation failure mode. In either case, one of DISP_LOOKUP_NEG or + * DISP_LOOKUP_POS will be set, indicating whether the child lookup + * was successful. + * + * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the + * child lookup. + */ +int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + struct lustre_handle lockh; + int rc = 0; + ENTRY; + LASSERT(it); + + CDEBUG(D_DLMTRACE, "(name: %.*s,"DFID") in obj "DFID + ", intent: %s flags %#o\n", op_data->op_namelen, + op_data->op_name, PFID(&op_data->op_fid2), + PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), + it->it_flags); + + lockh.cookie = 0; + if (fid_is_sane(&op_data->op_fid2) && + (it->it_op & (IT_LOOKUP | IT_GETATTR))) { + /* We could just return 1 immediately, but since we should only + * be called in revalidate_it if we already have a lock, let's + * verify that. */ + it->d.lustre.it_lock_handle = 0; + rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); + /* Only return failure if it was not GETATTR by cfid + (from inode_revalidate) */ + if (rc || op_data->op_namelen != 0) + RETURN(rc); + } + + /* lookup_it may be called only after revalidate_it has run, because + * revalidate_it cannot return errors, only zero. Returning zero causes + * this call to lookup, which *can* return an error. + * + * We only want to execute the request associated with the intent one + * time, however, so don't send the request again. Instead, skip past + * this and use the request from revalidate. In this case, revalidate + * never dropped its reference, so the refcounts are all OK */ + if (!it_disposition(it, DISP_ENQ_COMPLETE)) { + struct ldlm_enqueue_info einfo = + { LDLM_IBITS, it_to_lock_mode(it), cb_blocking, + ldlm_completion_ast, NULL, NULL, NULL }; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + } + rc = mdc_enqueue(exp, &einfo, it, op_data, &lockh, + lmm, lmmsize, NULL, extra_lock_flags); + if (rc < 0) + RETURN(rc); + } else if (!fid_is_sane(&op_data->op_fid2) || + !(it->it_create_mode & M_CHECK_STALE)) { + /* DISP_ENQ_COMPLETE set means there is extra reference on + * request referenced from this intent, saved for subsequent + * lookup. This path is executed when we proceed to this + * lookup, so we clear DISP_ENQ_COMPLETE */ + it_clear_disposition(it, DISP_ENQ_COMPLETE); + } + *reqp = it->d.lustre.it_data; + rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); + RETURN(rc); +} + +static int mdc_intent_getattr_async_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *args, int rc) +{ + struct mdc_getattr_args *ga = args; + struct obd_export *exp = ga->ga_exp; + struct md_enqueue_info *minfo = ga->ga_minfo; + struct ldlm_enqueue_info *einfo = ga->ga_einfo; + struct lookup_intent *it; + struct lustre_handle *lockh; + struct obd_device *obddev; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + it = &minfo->mi_it; + lockh = &minfo->mi_lockh; + + obddev = class_exp2obd(exp); + + mdc_exit_request(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; + + rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, + &flags, NULL, 0, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue_fini: %d\n", rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } + + rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); + if (rc) + GOTO(out, rc); + + rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); + EXIT; + +out: + OBD_FREE_PTR(einfo); + minfo->mi_cb(req, minfo, rc); + return 0; +} + +int mdc_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + struct md_op_data *op_data = &minfo->mi_data; + struct lookup_intent *it = &minfo->mi_it; + struct ptlrpc_request *req; + struct mdc_getattr_args *ga; + struct obd_device *obddev = class_exp2obd(exp); + struct ldlm_res_id res_id; + /*XXX: Both MDS_INODELOCK_LOOKUP and MDS_INODELOCK_UPDATE are needed + * for statahead currently. Consider CMD in future, such two bits + * maybe managed by different MDS, should be adjusted then. */ + ldlm_policy_data_t policy = { + .l_inodebits = { MDS_INODELOCK_LOOKUP | + MDS_INODELOCK_UPDATE } + }; + int rc = 0; + __u64 flags = LDLM_FL_HAS_INTENT; + ENTRY; + + CDEBUG(D_DLMTRACE,"name: %.*s in inode "DFID", intent: %s flags %#o\n", + op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), + ldlm_it2str(it->it_op), it->it_flags); + + fid_build_reg_res_name(&op_data->op_fid1, &res_id); + req = mdc_intent_getattr_pack(exp, it, op_data); + if (!req) + RETURN(-ENOMEM); + + rc = mdc_enter_request(&obddev->u.cli); + if (rc != 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, &policy, &flags, NULL, + 0, LVB_T_NONE, &minfo->mi_lockh, 1); + if (rc < 0) { + mdc_exit_request(&obddev->u.cli); + ptlrpc_req_finished(req); + RETURN(rc); + } + + CLASSERT(sizeof(*ga) <= sizeof(req->rq_async_args)); + ga = ptlrpc_req_async_args(req); + ga->ga_exp = exp; + ga->ga_minfo = minfo; + ga->ga_einfo = einfo; + + req->rq_interpret_reply = mdc_intent_getattr_async_interpret; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c new file mode 100644 index 000000000000..5e25a07c52bd --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/mdc_reint.c @@ -0,0 +1,489 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include + +#include +#include "mdc_internal.h" +#include + +/* mdc_setattr does its own semaphore handling */ +static int mdc_reint(struct ptlrpc_request *request, + struct mdc_rpc_lock *rpc_lock, + int level) +{ + int rc; + + request->rq_send_state = level; + + mdc_get_rpc_lock(rpc_lock, NULL); + rc = ptlrpc_queue_wait(request); + mdc_put_rpc_lock(rpc_lock, NULL); + if (rc) + CDEBUG(D_INFO, "error in handling %d\n", rc); + else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) { + rc = -EPROTO; + } + return rc; +} + +/* Find and cancel locally locks matched by inode @bits & @mode in the resource + * found by @fid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +int mdc_resource_get_unused(struct obd_export *exp, struct lu_fid *fid, + struct list_head *cancels, ldlm_mode_t mode, + __u64 bits) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + ldlm_policy_data_t policy = {{0}}; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + fid_build_reg_res_name(fid, &res_id); + res = ldlm_resource_get(exp->exp_obd->obd_namespace, + NULL, &res_id, 0, 0); + if (res == NULL) + RETURN(0); + LDLM_RESOURCE_ADDREF(res); + /* Initialize ibits lock policy. */ + policy.l_inodebits.bits = bits; + count = ldlm_cancel_resource_local(res, cancels, &policy, + mode, 0, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, struct md_open_data **mod) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + struct mdc_rpc_lock *rpc_lock; + struct obd_device *obd = exp->exp_obd; + int count = 0, rc; + __u64 bits; + ENTRY; + + LASSERT(op_data != NULL); + + bits = MDS_INODELOCK_UPDATE; + if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + bits |= MDS_INODELOCK_LOOKUP; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, bits); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_SETATTR); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + if ((op_data->op_flags & (MF_SOM_CHANGE | MF_EPOCH_OPEN)) == 0) + req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, + 0); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, + ea2len); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + rpc_lock = obd->u.cli.cl_rpc_lock; + + if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime "CFS_TIME_T + ", ctime "CFS_TIME_T"\n", + LTIME_S(op_data->op_attr.ia_mtime), + LTIME_S(op_data->op_attr.ia_ctime)); + mdc_setattr_pack(req, op_data, ea, ealen, ea2, ea2len); + + ptlrpc_request_set_replen(req); + if (mod && (op_data->op_flags & MF_EPOCH_OPEN) && + req->rq_import->imp_replayable) + { + LASSERT(*mod == NULL); + + *mod = obd_mod_alloc(); + if (*mod == NULL) { + DEBUG_REQ(D_ERROR, req, "Can't allocate " + "md_open_data"); + } else { + req->rq_replay = 1; + req->rq_cb_data = *mod; + (*mod)->mod_open_req = req; + req->rq_commit_cb = mdc_commit_open; + /** + * Take an extra reference on \var mod, it protects \var + * mod from being freed on eviction (commit callback is + * called despite rq_replay flag). + * Will be put on mdc_done_writing(). + */ + obd_mod_get(*mod); + } + } + + rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL); + + /* Save the obtained info in the original RPC for the replay case. */ + if (rc == 0 && (op_data->op_flags & MF_EPOCH_OPEN)) { + struct mdt_ioepoch *epoch; + struct mdt_body *body; + + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(epoch != NULL); + LASSERT(body != NULL); + epoch->handle = body->handle; + epoch->ioepoch = body->ioepoch; + req->rq_replay_cb = mdc_replay_open; + /** bug 3633, open may be committed and estale answer is not error */ + } else if (rc == -ESTALE && (op_data->op_flags & MF_SOM_CHANGE)) { + rc = 0; + } else if (rc == -ERESTARTSYS) { + rc = 0; + } + *request = req; + if (rc && req->rq_commit_cb) { + /* Put an extra reference on \var mod on error case. */ + obd_mod_put(*mod); + req->rq_commit_cb(req); + } + RETURN(rc); +} + +int mdc_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, + cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int level, rc; + int count, resends = 0; + struct obd_import *import = exp->exp_obd->u.cli.cl_import; + int generation = import->imp_generation; + LIST_HEAD(cancels); + ENTRY; + + /* For case if upper layer did not alloc fid, do it now. */ + if (!fid_is_sane(&op_data->op_fid2)) { + /* + * mdc_fid_alloc() may return errno 1 in case of switch to new + * sequence, handle this. + */ + rc = mdc_fid_alloc(exp, &op_data->op_fid2, op_data); + if (rc < 0) { + CERROR("Can't alloc new fid, rc %d\n", rc); + RETURN(rc); + } + } + +rebuild: + count = 0; + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_CREATE_RMT_ACL); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + data && datalen ? datalen : 0); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* + * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with + * tgt, for symlinks or lov MD data. + */ + mdc_create_pack(req, op_data, data, datalen, mode, uid, + gid, cap_effective, rdev); + + ptlrpc_request_set_replen(req); + + /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry + * logic here */ + req->rq_no_retry_einprogress = 1; + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = cfs_time_current_sec() + resends; + } + level = LUSTRE_IMP_FULL; + resend: + rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level); + + /* Resend if we were told to. */ + if (rc == -ERESTARTSYS) { + level = LUSTRE_IMP_RECOVER; + goto resend; + } else if (rc == -EINPROGRESS) { + /* Retry create infinitely until succeed or get other + * error code. */ + ptlrpc_req_finished(req); + resends++; + + CDEBUG(D_HA, "%s: resend:%d create on "DFID"/"DFID"\n", + exp->exp_obd->obd_name, resends, + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); + + if (generation == import->imp_generation) { + goto rebuild; + } else { + CDEBUG(D_HA, "resend cross eviction\n"); + RETURN(-EIO); + } + } else if (rc == 0) { + struct mdt_body *body; + struct lustre_capa *capa; + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body); + if (body->valid & OBD_MD_FLMDSCAPA) { + capa = req_capsule_server_get(&req->rq_pill, + &RMF_CAPA1); + if (capa == NULL) + rc = -EPROTO; + } + } + + *request = req; + RETURN(rc); +} + +int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req = *request; + int count = 0, rc; + ENTRY; + + LASSERT(req == NULL); + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3)) && + !OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_UNLINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_unlink_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_max_mds_cookiesize); + ptlrpc_request_set_replen(req); + + *request = req; + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + if (rc == -ERESTARTSYS) + rc = 0; + RETURN(rc); +} + +int mdc_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count = mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count += mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_link_pack(req, op_data); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} + +int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request) +{ + LIST_HEAD(cancels); + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req; + int count = 0, rc; + ENTRY; + + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + (fid_is_sane(&op_data->op_fid1))) + count = mdc_resource_get_unused(exp, &op_data->op_fid1, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && + (fid_is_sane(&op_data->op_fid2))) + count += mdc_resource_get_unused(exp, &op_data->op_fid2, + &cancels, LCK_EX, + MDS_INODELOCK_UPDATE); + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + (fid_is_sane(&op_data->op_fid3))) + count += mdc_resource_get_unused(exp, &op_data->op_fid3, + &cancels, LCK_EX, + MDS_INODELOCK_LOOKUP); + if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && + (fid_is_sane(&op_data->op_fid4))) + count += mdc_resource_get_unused(exp, &op_data->op_fid4, + &cancels, LCK_EX, + MDS_INODELOCK_FULL); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_REINT_RENAME); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); + req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, newlen+1); + + rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (exp_connect_cancelset(exp) && req) + ldlm_cli_cancel_list(&cancels, count, req, 0); + + mdc_rename_pack(req, op_data, old, oldlen, new, newlen); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_max_mds_cookiesize); + ptlrpc_request_set_replen(req); + + rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL); + *request = req; + if (rc == -ERESTARTSYS) + rc = 0; + + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c new file mode 100644 index 000000000000..88454bf75a71 --- /dev/null +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -0,0 +1,2752 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_MDC + +# include +# include +# include +# include +# include + +#include +#include +#include +#include +#include +#include + +#include "mdc_internal.h" + +#define REQUEST_MINOR 244 + +struct mdc_renew_capa_args { + struct obd_capa *ra_oc; + renew_capa_cb_t ra_cb; +}; + +static int mdc_cleanup(struct obd_device *obd); + +int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) +{ + struct lustre_capa *capa; + struct obd_capa *c; + ENTRY; + + /* swabbed already in mdc_enqueue */ + capa = req_capsule_server_get(&req->rq_pill, field); + if (capa == NULL) + RETURN(-EPROTO); + + c = alloc_capa(CAPA_SITE_CLIENT); + if (IS_ERR(c)) { + CDEBUG(D_INFO, "alloc capa failed!\n"); + RETURN(PTR_ERR(c)); + } else { + c->c_capa = *capa; + *oc = c; + RETURN(0); + } +} + +static inline int mdc_queue_wait(struct ptlrpc_request *req) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + int rc; + + /* mdc_enter_request() ensures that this client has no more + * than cl_max_rpcs_in_flight RPCs simultaneously inf light + * against an MDT. */ + rc = mdc_enter_request(cli); + if (rc != 0) + return rc; + + rc = ptlrpc_queue_wait(req); + mdc_exit_request(cli); + + return rc; +} + +/* Helper that implements most of mdc_getstatus and signal_completed_replay. */ +/* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */ +static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid, + struct obd_capa **pc, int level, int msg_flags) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_GETSTATUS, + LUSTRE_MDS_VERSION, MDS_GETSTATUS); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_pack_body(req, NULL, NULL, 0, 0, -1, 0); + lustre_msg_add_flags(req->rq_reqmsg, msg_flags); + req->rq_send_state = level; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + if (body->valid & OBD_MD_FLMDSCAPA) { + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc); + if (rc) + GOTO(out, rc); + } + + *rootfid = body->fid1; + CDEBUG(D_NET, + "root fid="DFID", last_committed="LPU64"\n", + PFID(rootfid), + lustre_msg_get_last_committed(req->rq_repmsg)); + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +/* This should be mdc_get_info("rootfid") */ +int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid, + struct obd_capa **pc) +{ + return send_getstatus(class_exp2cliimp(exp), rootfid, pc, + LUSTRE_IMP_FULL, 0); +} + +/* + * This function now is known to always saying that it will receive 4 buffers + * from server. Even for cases when acl_size and md_size is zero, RPC header + * will contain 4 fields and RPC itself will contain zero size fields. This is + * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed + * and thus zero, it shrinks it, making zero size. The same story about + * md_size. And this is course of problem when client waits for smaller number + * of fields. This issue will be fixed later when client gets aware of RPC + * layouts. --umka + */ +static int mdc_getattr_common(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body; + void *eadata; + int rc; + ENTRY; + + /* Request message already built. */ + rc = ptlrpc_queue_wait(req); + if (rc != 0) + RETURN(rc); + + /* sanity check for the reply */ + body = req_capsule_server_get(pill, &RMF_MDT_BODY); + if (body == NULL) + RETURN(-EPROTO); + + CDEBUG(D_NET, "mode: %o\n", body->mode); + + if (body->eadatasize != 0) { + mdc_update_max_ea_from_body(exp, body); + + eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + body->eadatasize); + if (eadata == NULL) + RETURN(-EPROTO); + } + + if (body->valid & OBD_MD_FLRMTPERM) { + struct mdt_remote_perm *perm; + + LASSERT(client_is_remote(exp)); + perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (perm == NULL) + RETURN(-EPROTO); + } + + if (body->valid & OBD_MD_FLMDSCAPA) { + struct lustre_capa *capa; + capa = req_capsule_server_get(pill, &RMF_CAPA1); + if (capa == NULL) + RETURN(-EPROTO); + } + + RETURN(0); +} + +int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + /* Single MDS without an LMV case */ + if (op_data->op_flags & MF_GET_MDT_IDX) { + op_data->op_mds = 0; + RETURN(0); + } + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, -1, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + if (op_data->op_valid & OBD_MD_FLRMTPERM) { + LASSERT(client_is_remote(exp)); + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + } + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_GETATTR_NAME); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + op_data->op_namelen + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + op_data->op_valid, op_data->op_mode, + op_data->op_suppgids[0], 0); + + if (op_data->op_name) { + char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == + op_data->op_namelen); + memcpy(name, op_data->op_name, op_data->op_namelen); + } + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + op_data->op_mode); + ptlrpc_request_set_replen(req); + + rc = mdc_getattr_common(exp, req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_IS_SUBDIR, LUSTRE_MDS_VERSION, + MDS_IS_SUBDIR); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_is_subdir_pack(req, pfid, cfid, 0); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc && rc != -EREMOTE) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_xattr_common(struct obd_export *exp,const struct req_format *fmt, + const struct lu_fid *fid, + struct obd_capa *oc, int opcode, obd_valid valid, + const char *xattr_name, const char *input, + int input_size, int output_size, int flags, + __u32 suppgid, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int xattr_namelen = 0; + char *tmp; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + if (xattr_name) { + xattr_namelen = strlen(xattr_name) + 1; + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + xattr_namelen); + } + if (input_size) { + LASSERT(input); + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, + input_size); + } + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (opcode == MDS_REINT) { + struct mdt_rec_setxattr *rec; + + CLASSERT(sizeof(struct mdt_rec_setxattr) == + sizeof(struct mdt_rec_reint)); + rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); + rec->sx_opcode = REINT_SETXATTR; + /* TODO: + * cfs_curproc_fs{u,g}id() should replace + * current->fs{u,g}id for portability. + */ + rec->sx_fsuid = current_fsuid(); + rec->sx_fsgid = current_fsgid(); + rec->sx_cap = cfs_curproc_cap_pack(); + rec->sx_suppgid1 = suppgid; + rec->sx_suppgid2 = -1; + rec->sx_fid = *fid; + rec->sx_valid = valid | OBD_MD_FLCTIME; + rec->sx_time = cfs_time_current_sec(); + rec->sx_size = output_size; + rec->sx_flags = flags; + + mdc_pack_capa(req, &RMF_CAPA1, oc); + } else { + mdc_pack_body(req, fid, oc, valid, output_size, suppgid, flags); + } + + if (xattr_name) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + memcpy(tmp, xattr_name, xattr_namelen); + } + if (input_size) { + tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); + memcpy(tmp, input, input_size); + } + + if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, + RCL_SERVER, output_size); + ptlrpc_request_set_replen(req); + + /* make rpc */ + if (opcode == MDS_REINT) + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + rc = ptlrpc_queue_wait(req); + + if (opcode == MDS_REINT) + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, obd_valid valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, __u32 suppgid, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, + fid, oc, MDS_REINT, valid, xattr_name, + input, input_size, output_size, flags, + suppgid, request); +} + +int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, obd_valid valid, const char *xattr_name, + const char *input, int input_size, int output_size, + int flags, struct ptlrpc_request **request) +{ + return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, + fid, oc, MDS_GETXATTR, valid, xattr_name, + input, input_size, output_size, flags, + -1, request); +} + +#ifdef CONFIG_FS_POSIX_ACL +static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + struct mdt_body *body = md->body; + struct posix_acl *acl; + void *buf; + int rc; + ENTRY; + + if (!body->aclsize) + RETURN(0); + + buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->aclsize); + + if (!buf) + RETURN(-EPROTO); + + acl = posix_acl_from_xattr(&init_user_ns, buf, body->aclsize); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl: %d\n", rc); + RETURN(rc); + } + + rc = posix_acl_valid(acl); + if (rc) { + CERROR("validate acl: %d\n", rc); + posix_acl_release(acl); + RETURN(rc); + } + + md->posix_acl = acl; + RETURN(0); +} +#else +#define mdc_unpack_acl(req, md) 0 +#endif + +int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, + struct obd_export *dt_exp, struct obd_export *md_exp, + struct lustre_md *md) +{ + struct req_capsule *pill = &req->rq_pill; + int rc; + ENTRY; + + LASSERT(md); + memset(md, 0, sizeof(*md)); + + md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); + LASSERT(md->body != NULL); + + if (md->body->valid & OBD_MD_FLEASIZE) { + int lmmsize; + struct lov_mds_md *lmm; + + if (!S_ISREG(md->body->mode)) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, should be a " + "regular file, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLEASIZE set, " + "but eadatasize 0\n"); + GOTO(out, rc = -EPROTO); + } + lmmsize = md->body->eadatasize; + lmm = req_capsule_server_sized_get(pill, &RMF_MDT_MD, lmmsize); + if (!lmm) + GOTO(out, rc = -EPROTO); + + rc = obd_unpackmd(dt_exp, &md->lsm, lmm, lmmsize); + if (rc < 0) + GOTO(out, rc); + + if (rc < sizeof(*md->lsm)) { + CDEBUG(D_INFO, "lsm size too small: " + "rc < sizeof (*md->lsm) (%d < %d)\n", + rc, (int)sizeof(*md->lsm)); + GOTO(out, rc = -EPROTO); + } + + } else if (md->body->valid & OBD_MD_FLDIREA) { + int lmvsize; + struct lov_mds_md *lmv; + + if(!S_ISDIR(md->body->mode)) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA set, should be a " + "directory, but is not\n"); + GOTO(out, rc = -EPROTO); + } + + if (md->body->eadatasize == 0) { + CDEBUG(D_INFO, "OBD_MD_FLDIREA is set, " + "but eadatasize 0\n"); + RETURN(-EPROTO); + } + if (md->body->valid & OBD_MD_MEA) { + lmvsize = md->body->eadatasize; + lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, + lmvsize); + if (!lmv) + GOTO(out, rc = -EPROTO); + + rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, + lmvsize); + if (rc < 0) + GOTO(out, rc); + + if (rc < sizeof(*md->mea)) { + CDEBUG(D_INFO, "size too small: " + "rc < sizeof(*md->mea) (%d < %d)\n", + rc, (int)sizeof(*md->mea)); + GOTO(out, rc = -EPROTO); + } + } + } + rc = 0; + + if (md->body->valid & OBD_MD_FLRMTPERM) { + /* remote permission */ + LASSERT(client_is_remote(exp)); + md->remote_perm = req_capsule_server_swab_get(pill, &RMF_ACL, + lustre_swab_mdt_remote_perm); + if (!md->remote_perm) + GOTO(out, rc = -EPROTO); + } + else if (md->body->valid & OBD_MD_FLACL) { + /* for ACL, it's possible that FLACL is set but aclsize is zero. + * only when aclsize != 0 there's an actual segment for ACL + * in reply buffer. + */ + if (md->body->aclsize) { + rc = mdc_unpack_acl(req, md); + if (rc) + GOTO(out, rc); +#ifdef CONFIG_FS_POSIX_ACL + } else { + md->posix_acl = NULL; +#endif + } + } + if (md->body->valid & OBD_MD_FLMDSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc); + if (rc) + GOTO(out, rc); + md->mds_capa = oc; + } + + if (md->body->valid & OBD_MD_FLOSSCAPA) { + struct obd_capa *oc = NULL; + + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc); + if (rc) + GOTO(out, rc); + md->oss_capa = oc; + } + + EXIT; +out: + if (rc) { + if (md->oss_capa) { + capa_put(md->oss_capa); + md->oss_capa = NULL; + } + if (md->mds_capa) { + capa_put(md->mds_capa); + md->mds_capa = NULL; + } +#ifdef CONFIG_FS_POSIX_ACL + posix_acl_release(md->posix_acl); +#endif + if (md->lsm) + obd_free_memmd(dt_exp, &md->lsm); + } + return rc; +} + +int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) +{ + ENTRY; + RETURN(0); +} + +/** + * Handles both OPEN and SETATTR RPCs for OPEN-CLOSE and SETATTR-DONE_WRITING + * RPC chains. + */ +void mdc_replay_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + struct ptlrpc_request *close_req; + struct obd_client_handle *och; + struct lustre_handle old; + struct mdt_body *body; + ENTRY; + + if (mod == NULL) { + DEBUG_REQ(D_ERROR, req, + "Can't properly replay without open data."); + EXIT; + return; + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + LASSERT(body != NULL); + + och = mod->mod_och; + if (och != NULL) { + struct lustre_handle *file_fh; + + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); + + file_fh = &och->och_fh; + CDEBUG(D_HA, "updating handle from "LPX64" to "LPX64"\n", + file_fh->cookie, body->handle.cookie); + old = *file_fh; + *file_fh = body->handle; + } + close_req = mod->mod_close_req; + if (close_req != NULL) { + __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); + struct mdt_ioepoch *epoch; + + LASSERT(opc == MDS_CLOSE || opc == MDS_DONE_WRITING); + epoch = req_capsule_client_get(&close_req->rq_pill, + &RMF_MDT_EPOCH); + LASSERT(epoch); + + if (och != NULL) + LASSERT(!memcmp(&old, &epoch->handle, sizeof(old))); + DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); + epoch->handle = body->handle; + } + EXIT; +} + +void mdc_commit_open(struct ptlrpc_request *req) +{ + struct md_open_data *mod = req->rq_cb_data; + if (mod == NULL) + return; + + /** + * No need to touch md_open_data::mod_och, it holds a reference on + * \var mod and will zero references to each other, \var mod will be + * freed after that when md_open_data::mod_och will put the reference. + */ + + /** + * Do not let open request to disappear as it still may be needed + * for close rpc to happen (it may happen on evict only, otherwise + * ptlrpc_request::rq_replay does not let mdc_commit_open() to be + * called), just mark this rpc as committed to distinguish these 2 + * cases, see mdc_close() for details. The open request reference will + * be put along with freeing \var mod. + */ + ptlrpc_request_addref(req); + spin_lock(&req->rq_lock); + req->rq_committed = 1; + spin_unlock(&req->rq_lock); + req->rq_cb_data = NULL; + obd_mod_put(mod); +} + +int mdc_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + struct ptlrpc_request *open_req) +{ + struct md_open_data *mod; + struct mdt_rec_create *rec; + struct mdt_body *body; + struct obd_import *imp = open_req->rq_import; + ENTRY; + + if (!open_req->rq_replay) + RETURN(0); + + rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); + body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); + LASSERT(rec != NULL); + /* Incoming message in my byte order (it's been swabbed). */ + /* Outgoing messages always in my byte order. */ + LASSERT(body != NULL); + + /* Only if the import is replayable, we set replay_open data */ + if (och && imp->imp_replayable) { + mod = obd_mod_alloc(); + if (mod == NULL) { + DEBUG_REQ(D_ERROR, open_req, + "Can't allocate md_open_data"); + RETURN(0); + } + + /** + * Take a reference on \var mod, to be freed on mdc_close(). + * It protects \var mod from being freed on eviction (commit + * callback is called despite rq_replay flag). + * Another reference for \var och. + */ + obd_mod_get(mod); + obd_mod_get(mod); + + spin_lock(&open_req->rq_lock); + och->och_mod = mod; + mod->mod_och = och; + mod->mod_open_req = open_req; + open_req->rq_cb_data = mod; + open_req->rq_commit_cb = mdc_commit_open; + spin_unlock(&open_req->rq_lock); + } + + rec->cr_fid2 = body->fid1; + rec->cr_ioepoch = body->ioepoch; + rec->cr_old_handle.cookie = body->handle.cookie; + open_req->rq_replay_cb = mdc_replay_open; + if (!fid_is_sane(&body->fid1)) { + DEBUG_REQ(D_ERROR, open_req, "Saving replay request with " + "insane fid"); + LBUG(); + } + + DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); + RETURN(0); +} + +int mdc_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + struct md_open_data *mod = och->och_mod; + ENTRY; + + /** + * It is possible to not have \var mod in a case of eviction between + * lookup and ll_file_open(). + **/ + if (mod == NULL) + RETURN(0); + + LASSERT(mod != LP_POISON); + + mod->mod_och = NULL; + och->och_mod = NULL; + obd_mod_put(mod); + + RETURN(0); +} + +/* Prepares the request for the replay by the given reply */ +static void mdc_close_handle_reply(struct ptlrpc_request *req, + struct md_op_data *op_data, int rc) { + struct mdt_body *repbody; + struct mdt_ioepoch *epoch; + + if (req && rc == -EAGAIN) { + repbody = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); + + epoch->flags |= MF_SOM_AU; + if (repbody->valid & OBD_MD_FLGETATTRLOCK) + op_data->op_flags |= MF_GETATTR_LOCK; + } +} + +int mdc_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, struct ptlrpc_request **request) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_CLOSE); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a + * portal whose threads are not taking any DLM locks and are therefore + * always progressing */ + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + /* Ensure that this close's handle is fixed up during replay. */ + if (likely(mod != NULL)) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED open %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + + DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); + /* We no longer want to preserve this open for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } else { + CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); + } + + mdc_close_pack(req, op_data); + + req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, + obd->u.cli.cl_max_mds_easize); + req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_SERVER, + obd->u.cli.cl_max_mds_cookiesize); + + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (req->rq_repmsg == NULL) { + CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, + req->rq_status); + if (rc == 0) + rc = req->rq_status ?: -EIO; + } else if (rc == 0 || rc == -EAGAIN) { + struct mdt_body *body; + + rc = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err " + "= %d", rc); + if (rc > 0) + rc = -rc; + } + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + rc = -EPROTO; + } else if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open was committed and + * server failed before close was sent. Let's check if mod + * exists and return no error in that case + */ + if (mod) { + DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through open_req only, + * thus close req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + *request = req; + mdc_close_handle_reply(req, op_data, rc); + RETURN(rc); +} + +int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_DONE_WRITING); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_DONE_WRITING); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (mod != NULL) { + LASSERTF(mod->mod_open_req != NULL && + mod->mod_open_req->rq_type != LI_POISON, + "POISONED setattr %p!\n", mod->mod_open_req); + + mod->mod_close_req = req; + DEBUG_REQ(D_HA, mod->mod_open_req, "matched setattr"); + /* We no longer want to preserve this setattr for replay even + * though the open was committed. b=3632, b=3633 */ + spin_lock(&mod->mod_open_req->rq_lock); + mod->mod_open_req->rq_replay = 0; + spin_unlock(&mod->mod_open_req->rq_lock); + } + + mdc_close_pack(req, op_data); + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(obd->u.cli.cl_close_lock, NULL); + + if (rc == -ESTALE) { + /** + * it can be allowed error after 3633 if open or setattr were + * committed and server failed before close was sent. + * Let's check if mod exists and return no error in that case + */ + if (mod) { + LASSERT(mod->mod_open_req != NULL); + if (mod->mod_open_req->rq_committed) + rc = 0; + } + } + + if (mod) { + if (rc != 0) + mod->mod_close_req = NULL; + /* Since now, mod is accessed through setattr req only, + * thus DW req does not keep a reference on mod anymore. */ + obd_mod_put(mod); + } + + mdc_close_handle_reply(req, op_data, rc); + ptlrpc_req_finished(req); + RETURN(rc); +} + + +int mdc_readpage(struct obd_export *exp, struct md_op_data *op_data, + struct page **pages, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + int i; + wait_queue_head_t waitq; + int resends = 0; + struct l_wait_info lwi; + int rc; + ENTRY; + + *request = NULL; + init_waitqueue_head(&waitq); + +restart_bulk: + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = MDS_READPAGE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + desc = ptlrpc_prep_bulk_imp(req, op_data->op_npages, 1, BULK_PUT_SINK, + MDS_BULK_PORTAL); + if (desc == NULL) { + ptlrpc_request_free(req); + RETURN(-ENOMEM); + } + + /* NB req now owns desc and will free it when it gets freed */ + for (i = 0; i < op_data->op_npages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + mdc_readdir_pack(req, op_data->op_offset, + PAGE_CACHE_SIZE * op_data->op_npages, + &op_data->op_fid1, op_data->op_capa1); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) { + ptlrpc_req_finished(req); + if (rc != -ETIMEDOUT) + RETURN(rc); + + resends++; + if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("too many resend retries, returning error\n"); + RETURN(-EIO); + } + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + + if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { + CERROR("Unexpected # bytes transferred: %d (%ld expected)\n", + req->rq_bulk->bd_nob_transferred, + PAGE_CACHE_SIZE * op_data->op_npages); + ptlrpc_req_finished(req); + RETURN(-EPROTO); + } + + *request = req; + RETURN(0); +} + +static int mdc_statfs(const struct lu_env *env, + struct obd_export *exp, struct obd_statfs *osfs, + __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct obd_statfs *msfs; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + /* + * Since the request might also come from lprocfs, so we need + * sync this with client_disconnect_export Bug15684 + */ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, + LUSTRE_MDS_VERSION, MDS_STATFS); + if (req == NULL) + GOTO(output, rc = -ENOMEM); + + ptlrpc_request_set_replen(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stay in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* check connection error first */ + if (imp->imp_connect_error) + rc = imp->imp_connect_error; + GOTO(out, rc); + } + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) + GOTO(out, rc = -EPROTO); + + *osfs = *msfs; + EXIT; +out: + ptlrpc_req_finished(req); +output: + class_import_put(imp); + return rc; +} + +static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) +{ + __u32 keylen, vallen; + void *key; + int rc; + + if (gf->gf_pathlen > PATH_MAX) + RETURN(-ENAMETOOLONG); + if (gf->gf_pathlen < 2) + RETURN(-EOVERFLOW); + + /* Key is KEY_FID2PATH + getinfo_fid2path description */ + keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf); + OBD_ALLOC(key, keylen); + if (key == NULL) + RETURN(-ENOMEM); + memcpy(key, KEY_FID2PATH, sizeof(KEY_FID2PATH)); + memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); + + CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); + + if (!fid_is_sane(&gf->gf_fid)) + GOTO(out, rc = -EINVAL); + + /* Val is struct getinfo_fid2path result plus path */ + vallen = sizeof(*gf) + gf->gf_pathlen; + + rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf, NULL); + if (rc != 0 && rc != -EREMOTE) + GOTO(out, rc); + + if (vallen <= sizeof(*gf)) + GOTO(out, rc = -EPROTO); + else if (vallen > sizeof(*gf) + gf->gf_pathlen) + GOTO(out, rc = -EOVERFLOW); + + CDEBUG(D_IOCTL, "path get "DFID" from "LPU64" #%d\n%s\n", + PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, gf->gf_path); + +out: + OBD_FREE(key, keylen); + return rc; +} + +static int mdc_ioc_hsm_progress(struct obd_export *exp, + struct hsm_progress_kernel *hpk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct hsm_progress_kernel *req_hpk; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, + LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); + if (req_hpk == NULL) + GOTO(out, rc = -EPROTO); + + *req_hpk = *hpk; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) +{ + __u32 *archive_mask; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_REGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_progress struct */ + archive_mask = req_capsule_client_get(&req->rq_pill, + &RMF_MDS_HSM_ARCHIVE); + if (archive_mask == NULL) + GOTO(out, rc = -EPROTO); + + *archive_mask = archives; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_current_action(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_current_action *hca = op_data->op_data; + struct hsm_current_action *req_hca; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_ACTION); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hca = req_capsule_server_get(&req->rq_pill, + &RMF_MDS_HSM_CURRENT_ACTION); + if (req_hca == NULL) + GOTO(out, rc = -EPROTO); + + *hca = *req_hca; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, + LUSTRE_MDS_VERSION, + MDS_HSM_CT_UNREGISTER); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_get(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_user_state *hus = op_data->op_data; + struct hsm_user_state *req_hus; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_GET); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); + if (rc != 0) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + if (rc) + GOTO(out, rc); + + req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); + if (req_hus == NULL) + GOTO(out, rc = -EPROTO); + + *hus = *req_hus; + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_state_set(struct obd_export *exp, + struct md_op_data *op_data) +{ + struct hsm_state_set *hss = op_data->op_data; + struct hsm_state_set *req_hss; + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_HSM_STATE_SET); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, &op_data->op_fid1, op_data->op_capa1, + OBD_MD_FLRMTPERM, 0, op_data->op_suppgids[0], 0); + + /* Copy states */ + req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); + if (req_hss == NULL) + GOTO(out, rc = -EPROTO); + *req_hss = *hss; + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); + + EXIT; +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_ioc_hsm_request(struct obd_export *exp, + struct hsm_user_request *hur) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + struct hsm_request *req_hr; + struct hsm_user_item *req_hui; + char *req_opaque; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, + hur->hur_request.hr_itemcount + * sizeof(struct hsm_user_item)); + req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, + hur->hur_request.hr_data_len); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, NULL, NULL, OBD_MD_FLRMTPERM, 0, 0, 0); + + /* Copy hsm_request struct */ + req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); + if (req_hr == NULL) + GOTO(out, rc = -EPROTO); + *req_hr = hur->hur_request; + + /* Copy hsm_user_item structs */ + req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); + if (req_hui == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_hui, hur->hur_user_item, + hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); + + /* Copy opaque field */ + req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); + if (req_opaque == NULL) + GOTO(out, rc = -EPROTO); + memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); + + ptlrpc_request_set_replen(req); + + rc = mdc_queue_wait(req); + GOTO(out, rc); + +out: + ptlrpc_req_finished(req); + return rc; +} + +static struct kuc_hdr *changelog_kuc_hdr(char *buf, int len, int flags) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)buf; + + LASSERT(len <= CR_MAXSIZE); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = KUC_TRANSPORT_CHANGELOG; + lh->kuc_flags = flags; + lh->kuc_msgtype = CL_RECORD; + lh->kuc_msglen = len; + return lh; +} + +#define D_CHANGELOG 0 + +struct changelog_show { + __u64 cs_startrec; + __u32 cs_flags; + struct file *cs_fp; + char *cs_buf; + struct obd_device *cs_obd; +}; + +static int changelog_show_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *hdr, void *data) +{ + struct changelog_show *cs = data; + struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr; + struct kuc_hdr *lh; + int len, rc; + ENTRY; + + if ((rec->cr_hdr.lrh_type != CHANGELOG_REC) || + (rec->cr.cr_type >= CL_LAST)) { + CERROR("Not a changelog rec %d/%d\n", rec->cr_hdr.lrh_type, + rec->cr.cr_type); + RETURN(-EINVAL); + } + + if (rec->cr.cr_index < cs->cs_startrec) { + /* Skip entries earlier than what we are interested in */ + CDEBUG(D_CHANGELOG, "rec="LPU64" start="LPU64"\n", + rec->cr.cr_index, cs->cs_startrec); + RETURN(0); + } + + CDEBUG(D_CHANGELOG, LPU64" %02d%-5s "LPU64" 0x%x t="DFID" p="DFID + " %.*s\n", rec->cr.cr_index, rec->cr.cr_type, + changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, + rec->cr.cr_flags & CLF_FLAGMASK, + PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), + rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); + + len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; + + /* Set up the message */ + lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags); + memcpy(lh + 1, &rec->cr, len - sizeof(*lh)); + + rc = libcfs_kkuc_msg_put(cs->cs_fp, lh); + CDEBUG(D_CHANGELOG, "kucmsg fp %p len %d rc %d\n", cs->cs_fp, len,rc); + + RETURN(rc); +} + +static int mdc_changelog_send_thread(void *csdata) +{ + struct changelog_show *cs = csdata; + struct llog_ctxt *ctxt = NULL; + struct llog_handle *llh = NULL; + struct kuc_hdr *kuch; + int rc; + + CDEBUG(D_CHANGELOG, "changelog to fp=%p start "LPU64"\n", + cs->cs_fp, cs->cs_startrec); + + OBD_ALLOC(cs->cs_buf, CR_MAXSIZE); + if (cs->cs_buf == NULL) + GOTO(out, rc = -ENOMEM); + + /* Set up the remote catalog handle */ + ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt == NULL) + GOTO(out, rc = -ENOENT); + rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG, + LLOG_OPEN_EXISTS); + if (rc) { + CERROR("%s: fail to open changelog catalog: rc = %d\n", + cs->cs_obd->obd_name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT, NULL); + if (rc) { + CERROR("llog_init_handle failed %d\n", rc); + GOTO(out, rc); + } + + rc = llog_cat_process(NULL, llh, changelog_show_cb, cs, 0, 0); + + /* Send EOF no matter what our result */ + if ((kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch), + cs->cs_flags))) { + kuch->kuc_msgtype = CL_EOF; + libcfs_kkuc_msg_put(cs->cs_fp, kuch); + } + +out: + fput(cs->cs_fp); + if (llh) + llog_cat_close(NULL, llh); + if (ctxt) + llog_ctxt_put(ctxt); + if (cs->cs_buf) + OBD_FREE(cs->cs_buf, CR_MAXSIZE); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_changelog_send(struct obd_device *obd, + struct ioc_changelog *icc) +{ + struct changelog_show *cs; + int rc; + + /* Freed in mdc_changelog_send_thread */ + OBD_ALLOC_PTR(cs); + if (!cs) + return -ENOMEM; + + cs->cs_obd = obd; + cs->cs_startrec = icc->icc_recno; + /* matching fput in mdc_changelog_send_thread */ + cs->cs_fp = fget(icc->icc_id); + cs->cs_flags = icc->icc_flags; + + /* + * New thread because we should return to user app before + * writing into our pipe + */ + rc = PTR_ERR(kthread_run(mdc_changelog_send_thread, cs, + "mdc_clg_send_thread")); + if (!IS_ERR_VALUE(rc)) { + CDEBUG(D_CHANGELOG, "start changelog thread\n"); + return 0; + } + + CERROR("Failed to start changelog thread: %d\n", rc); + OBD_FREE_PTR(cs); + return rc; +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk); + +static int mdc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACHECK, LUSTRE_MDS_VERSION, + MDS_QUOTACHECK); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mdc_quota_poll_check(struct obd_export *exp, + struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + ENTRY; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + RETURN(rc); +} + +static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION, + MDS_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int mdc_ioc_swap_layouts(struct obd_export *exp, + struct md_op_data *op_data) +{ + LIST_HEAD(cancels); + struct ptlrpc_request *req; + int rc, count; + struct mdc_swap_layouts *msl, *payload; + ENTRY; + + msl = op_data->op_data; + + /* When the MDT will get the MDS_SWAP_LAYOUTS RPC the + * first thing it will do is to cancel the 2 layout + * locks hold by this client. + * So the client must cancel its layout locks on the 2 fids + * with the request RPC to avoid extra RPC round trips + */ + count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, + LCK_CR, MDS_INODELOCK_LAYOUT); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_MDS_SWAP_LAYOUTS); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + mdc_set_capa_size(req, &RMF_CAPA1, op_data->op_capa1); + mdc_set_capa_size(req, &RMF_CAPA2, op_data->op_capa2); + + rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_swap_layouts_pack(req, op_data); + + payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); + LASSERT(payload); + + *payload = *msl; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + EXIT; + +out: + ptlrpc_req_finished(req); + return rc; +} + +static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obd->u.cli.cl_import; + struct llog_ctxt *ctxt; + int rc; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_CHANGELOG_SEND: + rc = mdc_ioc_changelog_send(obd, karg); + GOTO(out, rc); + case OBD_IOC_CHANGELOG_CLEAR: { + struct ioc_changelog *icc = karg; + struct changelog_setinfo cs = + {.cs_recno = icc->icc_recno, .cs_id = icc->icc_id}; + rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR), + KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, + NULL); + GOTO(out, rc); + } + case OBD_IOC_FID2PATH: + rc = mdc_ioc_fid2path(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_CT_START: + rc = mdc_ioc_hsm_ct_start(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_PROGRESS: + rc = mdc_ioc_hsm_progress(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_GET: + rc = mdc_ioc_hsm_state_get(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_STATE_SET: + rc = mdc_ioc_hsm_state_set(exp, karg); + case LL_IOC_HSM_ACTION: + rc = mdc_ioc_hsm_current_action(exp, karg); + GOTO(out, rc); + case LL_IOC_HSM_REQUEST: + rc = mdc_ioc_hsm_request(exp, karg); + GOTO(out, rc); + case OBD_IOC_CLIENT_RECOVER: + rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); + if (rc < 0) + GOTO(out, rc); + GOTO(out, rc = 0); + case IOC_OSC_SET_ACTIVE: + rc = ptlrpc_set_import_active(imp, data->ioc_offset); + GOTO(out, rc); + case OBD_IOC_PARSE: { + ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT); + rc = class_config_parse_llog(NULL, ctxt, data->ioc_inlbuf1, + NULL); + llog_ctxt_put(ctxt); + GOTO(out, rc); + } + case OBD_IOC_LLOG_INFO: + case OBD_IOC_LLOG_PRINT: { + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_ioctl(NULL, ctxt, cmd, data); + llog_ctxt_put(ctxt); + GOTO(out, rc); + } + case OBD_IOC_POLL_QUOTACHECK: + rc = mdc_quota_poll_check(exp, (struct if_quotacheck *)karg); + GOTO(out, rc); + case OBD_IOC_PING_TARGET: + rc = ptlrpc_obd_ping(obd); + GOTO(out, rc); + /* + * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by + * LMV instead of MDC. But when the cluster is upgraded from 1.8, + * there'd be no LMV layer thus we might be called here. Eventually + * this code should be removed. + * bz20731, LU-592. + */ + case IOC_OBD_STATFS: { + struct obd_statfs stat_buf = {0}; + + if (*((__u32 *) data->ioc_inlbuf2) != 0) + GOTO(out, rc = -ENODEV); + + /* copy UUID */ + if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), + min((int) data->ioc_plen2, + (int) sizeof(struct obd_uuid)))) + GOTO(out, rc = -EFAULT); + + rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + 0); + if (rc != 0) + GOTO(out, rc); + + if (copy_to_user(data->ioc_pbuf1, &stat_buf, + min((int) data->ioc_plen1, + (int) sizeof(stat_buf)))) + GOTO(out, rc = -EFAULT); + + GOTO(out, rc = 0); + } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = obd->u.cli.cl_target_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } + case LL_IOC_GET_CONNECT_FLAGS: { + if (copy_to_user(uarg, + exp_connect_flags_ptr(exp), + sizeof(__u64))) + GOTO(out, rc = -EFAULT); + else + GOTO(out, rc = 0); + } + case LL_IOC_LOV_SWAP_LAYOUTS: { + rc = mdc_ioc_swap_layouts(exp, karg); + break; + } + default: + CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO(out, rc = -ENOTTY); + } +out: + module_put(THIS_MODULE); + + return rc; +} + +int mdc_get_info_rpc(struct obd_export *exp, + obd_count keylen, void *key, + int vallen, void *val) +{ + struct obd_import *imp = class_exp2cliimp(exp); + struct ptlrpc_request *req; + char *tmp; + int rc = -EINVAL; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, + RCL_CLIENT, sizeof(__u32)); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); + memcpy(tmp, &vallen, sizeof(__u32)); + + req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, + RCL_SERVER, vallen); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + /* -EREMOTE means the get_info result is partial, and it needs to + * continue on another MDT, see fid2path part in lmv_iocontrol */ + if (rc == 0 || rc == -EREMOTE) { + tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); + memcpy(val, tmp, vallen); + if (ptlrpc_rep_need_swab(req)) { + if (KEY_IS(KEY_FID2PATH)) + lustre_swab_fid2path(val); + } + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static void lustre_swab_hai(struct hsm_action_item *h) +{ + __swab32s(&h->hai_len); + __swab32s(&h->hai_action); + lustre_swab_lu_fid(&h->hai_fid); + lustre_swab_lu_fid(&h->hai_dfid); + __swab64s(&h->hai_cookie); + __swab64s(&h->hai_extent.offset); + __swab64s(&h->hai_extent.length); + __swab64s(&h->hai_gid); +} + +static void lustre_swab_hal(struct hsm_action_list *h) +{ + struct hsm_action_item *hai; + int i; + + __swab32s(&h->hal_version); + __swab32s(&h->hal_count); + __swab32s(&h->hal_archive_id); + __swab64s(&h->hal_flags); + hai = hai_zero(h); + for (i = 0; i < h->hal_count; i++) { + lustre_swab_hai(hai); + hai = hai_next(hai); + } +} + +static void lustre_swab_kuch(struct kuc_hdr *l) +{ + __swab16s(&l->kuc_magic); + /* __u8 l->kuc_transport */ + __swab16s(&l->kuc_msgtype); + __swab16s(&l->kuc_msglen); +} + +static int mdc_ioc_hsm_ct_start(struct obd_export *exp, + struct lustre_kernelcomm *lk) +{ + struct obd_import *imp = class_exp2cliimp(exp); + __u32 archive = lk->lk_data; + int rc = 0; + + if (lk->lk_group != KUC_GRP_HSM) { + CERROR("Bad copytool group %d\n", lk->lk_group); + return -EINVAL; + } + + CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, + lk->lk_uid, lk->lk_group, lk->lk_flags); + + if (lk->lk_flags & LK_FLG_STOP) { + rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group); + /* Unregister with the coordinator */ + if (rc == 0) + rc = mdc_ioc_hsm_ct_unregister(imp); + } else { + struct file *fp = fget(lk->lk_wfd); + + rc = libcfs_kkuc_group_add(fp, lk->lk_uid, lk->lk_group, + lk->lk_data); + if (rc && fp) + fput(fp); + if (rc == 0) + rc = mdc_ioc_hsm_ct_register(imp, archive); + } + + return rc; +} + +/** + * Send a message to any listening copytools + * @param val KUC message (kuc_hdr + hsm_action_list) + * @param len total length of message + */ +static int mdc_hsm_copytool_send(int len, void *val) +{ + struct kuc_hdr *lh = (struct kuc_hdr *)val; + struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); + int rc; + ENTRY; + + if (len < sizeof(*lh) + sizeof(*hal)) { + CERROR("Short HSM message %d < %d\n", len, + (int) (sizeof(*lh) + sizeof(*hal))); + RETURN(-EPROTO); + } + if (lh->kuc_magic == __swab16(KUC_MAGIC)) { + lustre_swab_kuch(lh); + lustre_swab_hal(hal); + } else if (lh->kuc_magic != KUC_MAGIC) { + CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); + RETURN(-EPROTO); + } + + CDEBUG(D_HSM, " Received message mg=%x t=%d m=%d l=%d actions=%d " + "on %s\n", + lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, + lh->kuc_msglen, hal->hal_count, hal->hal_fsname); + + /* Broadcast to HSM listeners */ + rc = libcfs_kkuc_group_put(KUC_GRP_HSM, lh); + + RETURN(rc); +} + +/** + * callback function passed to kuc for re-registering each HSM copytool + * running on MDC, after MDT shutdown/recovery. + * @param data archive id served by the copytool + * @param cb_arg callback argument (obd_import) + */ +static int mdc_hsm_ct_reregister(__u32 data, void *cb_arg) +{ + struct obd_import *imp = (struct obd_import *)cb_arg; + __u32 archive = data; + int rc; + + CDEBUG(D_HA, "recover copytool registration to MDT (archive=%#x)\n", + archive); + rc = mdc_ioc_hsm_ct_register(imp, archive); + + /* ignore error if the copytool is already registered */ + return ((rc != 0) && (rc != -EEXIST)) ? rc : 0; +} + +/** + * Re-establish all kuc contexts with MDT + * after MDT shutdown/recovery. + */ +static int mdc_kuc_reregister(struct obd_import *imp) +{ + /* re-register HSM agents */ + return libcfs_kkuc_group_foreach(KUC_GRP_HSM, mdc_hsm_ct_reregister, + (void *)imp); +} + +int mdc_set_info_async(const struct lu_env *env, + struct obd_export *exp, + obd_count keylen, void *key, + obd_count vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_import *imp = class_exp2cliimp(exp); + int rc; + ENTRY; + + if (KEY_IS(KEY_READ_ONLY)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + + spin_lock(&imp->imp_lock); + if (*((int *)val)) { + imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags |= + OBD_CONNECT_RDONLY; + } else { + imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; + imp->imp_connect_data.ocd_connect_flags &= + ~OBD_CONNECT_RDONLY; + } + spin_unlock(&imp->imp_lock); + + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + RETURN(0); + } + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + if (KEY_IS(KEY_MDS_CONN)) { + /* mds-mds import */ + spin_lock(&imp->imp_lock); + imp->imp_server_timeout = 1; + spin_unlock(&imp->imp_lock); + imp->imp_client->cli_request_portal = MDS_MDS_PORTAL; + CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name); + RETURN(0); + } + if (KEY_IS(KEY_CHANGELOG_CLEAR)) { + rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, + keylen, key, vallen, val, set); + RETURN(rc); + } + if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { + rc = mdc_hsm_copytool_send(vallen, val); + RETURN(rc); + } + + CERROR("Unknown key %s\n", (char *)key); + RETURN(-EINVAL); +} + +int mdc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_MAX_EASIZE)) { + int mdsize, *max_easize; + + if (*vallen != sizeof(int)) + RETURN(-EINVAL); + mdsize = *(int*)val; + if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) + exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; + max_easize = val; + *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; + RETURN(0); + } else if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen != sizeof(*data)) + RETURN(-EINVAL); + + *data = imp->imp_connect_data; + RETURN(0); + } else if (KEY_IS(KEY_TGT_COUNT)) { + *((int *)val) = 1; + RETURN(0); + } + + rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); + + RETURN(rc); +} + +static int mdc_pin(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct obd_client_handle *handle, + int flags) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_PIN); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_PIN); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, fid, oc, 0, 0, -1, flags); + + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + if (rc) { + CERROR("Pin failed: %d\n", rc); + GOTO(err_out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(err_out, rc = -EPROTO); + + handle->och_fh = body->handle; + handle->och_magic = OBD_CLIENT_HANDLE_MAGIC; + + handle->och_mod = obd_mod_alloc(); + if (handle->och_mod == NULL) { + DEBUG_REQ(D_ERROR, req, "can't allocate md_open_data"); + GOTO(err_out, rc = -ENOMEM); + } + handle->och_mod->mod_open_req = req; /* will be dropped by unpin */ + + RETURN(0); + +err_out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mdc_unpin(struct obd_export *exp, struct obd_client_handle *handle, + int flag) +{ + struct ptlrpc_request *req; + struct mdt_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_UNPIN, + LUSTRE_MDS_VERSION, MDS_UNPIN); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY); + body->handle = handle->och_fh; + body->flags = flag; + + ptlrpc_request_set_replen(req); + + mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); + + if (rc != 0) + CERROR("Unpin failed: %d\n", rc); + + ptlrpc_req_finished(req); + ptlrpc_req_finished(handle->och_mod->mod_open_req); + + obd_mod_put(handle->och_mod); + RETURN(rc); +} + +int mdc_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, fid, oc, 0, 0, -1, 0); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { +#if 0 + /* XXX Pass event up to OBDs stack. used only for FLD now */ + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DISCON, NULL); +#endif + break; + } + case IMP_EVENT_INACTIVE: { + struct client_obd *cli = &obd->u.cli; + /* + * Flush current sequence to make client obtain new one + * from server in case of disconnect/reconnect. + */ + if (cli->cl_seq != NULL) + seq_client_flush(cli->cl_seq); + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + + break; + } + case IMP_EVENT_ACTIVE: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + /* restore re-establish kuc registration after reconnecting */ + if (rc == 0) + rc = mdc_kuc_reregister(imp); + break; + case IMP_EVENT_OCD: + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %x\n", event); + LBUG(); + } + RETURN(rc); +} + +int mdc_fid_alloc(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct lu_client_seq *seq = cli->cl_seq; + ENTRY; + RETURN(seq_client_alloc_fid(NULL, seq, fid)); +} + +struct obd_uuid *mdc_get_uuid(struct obd_export *exp) { + struct client_obd *cli = &exp->exp_obd->u.cli; + return &cli->cl_target_uuid; +} + +/** + * Determine whether the lock can be canceled before replaying it during + * recovery, non zero value will be return if the lock can be canceled, + * or zero returned for not + */ +static int mdc_cancel_for_recovery(struct ldlm_lock *lock) +{ + if (lock->l_resource->lr_type != LDLM_IBITS) + RETURN(0); + + /* FIXME: if we ever get into a situation where there are too many + * opened files with open locks on a single node, then we really + * should replay these open locks to reget it */ + if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN) + RETURN(0); + + RETURN(1); +} + +static int mdc_resource_inode_free(struct ldlm_resource *res) +{ + if (res->lr_lvb_inode) + res->lr_lvb_inode = NULL; + + return 0; +} + +struct ldlm_valblock_ops inode_lvbo = { + lvbo_free: mdc_resource_inode_free +}; + +static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + struct client_obd *cli = &obd->u.cli; + struct lprocfs_static_vars lvars = { 0 }; + int rc; + ENTRY; + + OBD_ALLOC(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock)); + if (!cli->cl_rpc_lock) + RETURN(-ENOMEM); + mdc_init_rpc_lock(cli->cl_rpc_lock); + + ptlrpcd_addref(); + + OBD_ALLOC(cli->cl_close_lock, sizeof (*cli->cl_close_lock)); + if (!cli->cl_close_lock) + GOTO(err_rpc_lock, rc = -ENOMEM); + mdc_init_rpc_lock(cli->cl_close_lock); + + rc = client_obd_setup(obd, cfg); + if (rc) + GOTO(err_close_lock, rc); + lprocfs_mdc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + + ns_register_cancel(obd->obd_namespace, mdc_cancel_for_recovery); + + obd->obd_namespace->ns_lvbo = &inode_lvbo; + + rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL); + if (rc) { + mdc_cleanup(obd); + CERROR("failed to setup llogging subsystems\n"); + } + + RETURN(rc); + +err_close_lock: + OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock)); +err_rpc_lock: + OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock)); + ptlrpcd_decref(); + RETURN(rc); +} + +/* Initialize the default and maximum LOV EA and cookie sizes. This allows + * us to make MDS RPCs with large enough reply buffers to hold the + * maximum-sized (= maximum striped) EA and cookie without having to + * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ +static int mdc_init_ea_size(struct obd_export *exp, int easize, + int def_easize, int cookiesize) +{ + struct obd_device *obd = exp->exp_obd; + struct client_obd *cli = &obd->u.cli; + ENTRY; + + if (cli->cl_max_mds_easize < easize) + cli->cl_max_mds_easize = easize; + + if (cli->cl_default_mds_easize < def_easize) + cli->cl_default_mds_easize = def_easize; + + if (cli->cl_max_mds_cookiesize < cookiesize) + cli->cl_max_mds_cookiesize = cookiesize; + + RETURN(0); +} + +static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + /* Failsafe, ok if racy */ + if (obd->obd_type->typ_refcnt <= 1) + libcfs_kkuc_group_rem(0, KUC_GRP_HSM); + + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + RETURN(rc); +} + +static int mdc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + + OBD_FREE(cli->cl_rpc_lock, sizeof (*cli->cl_rpc_lock)); + OBD_FREE(cli->cl_close_lock, sizeof (*cli->cl_close_lock)); + + ptlrpcd_decref(); + + return client_obd_cleanup(obd); +} + + +static int mdc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *tgt, int *index) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + LASSERT(olg == &obd->obd_olg); + + rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, tgt, + &llog_client_ops); + if (rc) + RETURN(rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +} + +static int mdc_llog_finish(struct obd_device *obd, int count) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + + RETURN(0); +} + +static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct lprocfs_static_vars lvars = { 0 }; + int rc = 0; + + lprocfs_mdc_init_vars(&lvars); + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + return(rc); +} + + +/* get remote permission for current user on fid */ +int mdc_get_remote_perm(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + LASSERT(client_is_remote(exp)); + + *request = NULL; + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + mdc_set_capa_size(req, &RMF_CAPA1, oc); + + rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + mdc_pack_body(req, fid, oc, OBD_MD_FLRMTPERM, 0, suppgid, 0); + + req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, + sizeof(struct mdt_remote_perm)); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + ptlrpc_req_finished(req); + else + *request = req; + RETURN(rc); +} + +static int mdc_interpret_renew_capa(const struct lu_env *env, + struct ptlrpc_request *req, void *args, + int status) +{ + struct mdc_renew_capa_args *ra = args; + struct mdt_body *body = NULL; + struct lustre_capa *capa; + ENTRY; + + if (status) + GOTO(out, capa = ERR_PTR(status)); + + body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); + if (body == NULL) + GOTO(out, capa = ERR_PTR(-EFAULT)); + + if ((body->valid & OBD_MD_FLOSSCAPA) == 0) + GOTO(out, capa = ERR_PTR(-ENOENT)); + + capa = req_capsule_server_get(&req->rq_pill, &RMF_CAPA2); + if (!capa) + GOTO(out, capa = ERR_PTR(-EFAULT)); + EXIT; +out: + ra->ra_cb(ra->ra_oc, capa); + return 0; +} + +static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc, + renew_capa_cb_t cb) +{ + struct ptlrpc_request *req; + struct mdc_renew_capa_args *ra; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_MDS_GETATTR, + LUSTRE_MDS_VERSION, MDS_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + /* NB, OBD_MD_FLOSSCAPA is set here, but it doesn't necessarily mean the + * capa to renew is oss capa. + */ + mdc_pack_body(req, &oc->c_capa.lc_fid, oc, OBD_MD_FLOSSCAPA, 0, -1, 0); + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*ra) <= sizeof(req->rq_async_args)); + ra = ptlrpc_req_async_args(req); + ra->ra_oc = oc; + ra->ra_cb = cb; + req->rq_interpret_reply = mdc_interpret_renew_capa; + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + RETURN(0); +} + +static int mdc_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *obd, struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + struct obd_import *imp = obd->u.cli.cl_import; + + /* mds-mds import features */ + if (data && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS)) { + spin_lock(&imp->imp_lock); + imp->imp_server_timeout = 1; + spin_unlock(&imp->imp_lock); + imp->imp_client->cli_request_portal = MDS_MDS_PORTAL; + CDEBUG(D_OTHER, "%s: Set 'mds' portal and timeout\n", + obd->obd_name); + } + + return client_connect_import(env, exp, obd, cluuid, data, NULL); +} + +struct obd_ops mdc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mdc_setup, + .o_precleanup = mdc_precleanup, + .o_cleanup = mdc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = mdc_connect, + .o_disconnect = client_disconnect_export, + .o_iocontrol = mdc_iocontrol, + .o_set_info_async = mdc_set_info_async, + .o_statfs = mdc_statfs, + .o_pin = mdc_pin, + .o_unpin = mdc_unpin, + .o_fid_init = client_fid_init, + .o_fid_fini = client_fid_fini, + .o_fid_alloc = mdc_fid_alloc, + .o_import_event = mdc_import_event, + .o_llog_init = mdc_llog_init, + .o_llog_finish = mdc_llog_finish, + .o_get_info = mdc_get_info, + .o_process_config = mdc_process_config, + .o_get_uuid = mdc_get_uuid, + .o_quotactl = mdc_quotactl, + .o_quotacheck = mdc_quotacheck +}; + +struct md_ops mdc_md_ops = { + .m_getstatus = mdc_getstatus, + .m_null_inode = mdc_null_inode, + .m_find_cbdata = mdc_find_cbdata, + .m_close = mdc_close, + .m_create = mdc_create, + .m_done_writing = mdc_done_writing, + .m_enqueue = mdc_enqueue, + .m_getattr = mdc_getattr, + .m_getattr_name = mdc_getattr_name, + .m_intent_lock = mdc_intent_lock, + .m_link = mdc_link, + .m_is_subdir = mdc_is_subdir, + .m_rename = mdc_rename, + .m_setattr = mdc_setattr, + .m_setxattr = mdc_setxattr, + .m_getxattr = mdc_getxattr, + .m_sync = mdc_sync, + .m_readpage = mdc_readpage, + .m_unlink = mdc_unlink, + .m_cancel_unused = mdc_cancel_unused, + .m_init_ea_size = mdc_init_ea_size, + .m_set_lock_data = mdc_set_lock_data, + .m_lock_match = mdc_lock_match, + .m_get_lustre_md = mdc_get_lustre_md, + .m_free_lustre_md = mdc_free_lustre_md, + .m_set_open_replay_data = mdc_set_open_replay_data, + .m_clear_open_replay_data = mdc_clear_open_replay_data, + .m_renew_capa = mdc_renew_capa, + .m_unpack_capa = mdc_unpack_capa, + .m_get_remote_perm = mdc_get_remote_perm, + .m_intent_getattr_async = mdc_intent_getattr_async, + .m_revalidate_lock = mdc_revalidate_lock +}; + +int __init mdc_init(void) +{ + int rc; + struct lprocfs_static_vars lvars = { 0 }; + lprocfs_mdc_init_vars(&lvars); + + rc = class_register_type(&mdc_obd_ops, &mdc_md_ops, lvars.module_vars, + LUSTRE_MDC_NAME, NULL); + RETURN(rc); +} + +static void /*__exit*/ mdc_exit(void) +{ + class_unregister_type(LUSTRE_MDC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Client"); +MODULE_LICENSE("GPL"); + +module_init(mdc_init); +module_exit(mdc_exit); diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile new file mode 100644 index 000000000000..267246344e1c --- /dev/null +++ b/drivers/staging/lustre/lustre/mgc/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += mgc.o +mgc-y := mgc_request.o lproc_mgc.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/mgc/libmgc.c b/drivers/staging/lustre/lustre/mgc/libmgc.c new file mode 100644 index 000000000000..442146cc7e60 --- /dev/null +++ b/drivers/staging/lustre/lustre/mgc/libmgc.c @@ -0,0 +1,166 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/mgc/libmgc.c + * + * Lustre Management Client + * Author: Nathan Rutman + */ + +/* Minimal MGC for liblustre: only used to read the config log from the MGS + at setup time, no updates. */ + +#define DEBUG_SUBSYSTEM S_MGC + +#include + +#include +#include +#include +#include +#include + + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int rc; + ENTRY; + + ptlrpcd_addref(); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + /* liblustre only support null flavor to MGS */ + obd->u.cli.cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_NULL; + + rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + RETURN(rc); + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: + case OBD_CLEANUP_EXPORTS: + obd_cleanup_client_import(obd); + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + ENTRY; + + LASSERT(cli->cl_mgc_vfsmnt == NULL); + + ptlrpcd_decref(); + + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *tgt, int *index) +{ + struct llog_ctxt *ctxt; + int rc; + ENTRY; + + LASSERT(olg == &obd->obd_olg); + rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt, + &llog_client_ops); + if (rc < 0) + RETURN(rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT); + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +static int mgc_llog_finish(struct obd_device *obd, int count) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + + RETURN(0); +} + +struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + .o_llog_init = mgc_llog_init, + .o_llog_finish = mgc_llog_finish, +}; + +int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, + NULL, LUSTRE_MGC_NAME, NULL); +} diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c new file mode 100644 index 000000000000..041f365beabf --- /dev/null +++ b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c @@ -0,0 +1,68 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include "mgc_internal.h" + +#ifdef LPROCFS + +static struct lprocfs_vars lprocfs_mgc_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "ping", 0, lprocfs_wr_ping, 0, 0, 0222 }, + { "connect_flags", lprocfs_rd_connect_flags, 0, 0 }, + { "mgs_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, + { "mgs_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, + { "import", lprocfs_rd_import, 0, 0 }, + { "state", lprocfs_rd_state, 0, 0 }, + { "ir_state", lprocfs_mgc_rd_ir_state, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_mgc_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_mgc_module_vars; + lvars->obd_vars = lprocfs_mgc_obd_vars; +} +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h new file mode 100644 index 000000000000..111db90f33a7 --- /dev/null +++ b/drivers/staging/lustre/lustre/mgc/mgc_internal.h @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _MGC_INTERNAL_H +#define _MGC_INTERNAL_H + +#include +#include +#include +#include +#include +#include + +#ifdef LPROCFS +void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars); +int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off, + int count, int *eof, void *data); +#else +static void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +static inline int lprocfs_mgc_rd_ir_state(char *page, char **start, + off_t off, int count, int *eof, void *data) +{ + return 0; +} +#endif /* LPROCFS */ + +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + +static inline int cld_is_sptlrpc(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_SPTLRPC; +} + +static inline int cld_is_recover(struct config_llog_data *cld) +{ + return cld->cld_type == CONFIG_T_RECOVER; +} + +#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c new file mode 100644 index 000000000000..74232f4c1004 --- /dev/null +++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c @@ -0,0 +1,1863 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/mgc/mgc_request.c + * + * Author: Nathan Rutman + */ + +#define DEBUG_SUBSYSTEM S_MGC +#define D_MGC D_CONFIG /*|D_WARNING*/ + +# include +# include +# include +# include + +#include +#include +#include +#include +#include +#include +#include "mgc_internal.h" + +static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, + int type) +{ + __u64 resname = 0; + + if (len > 8) { + CERROR("name too long: %s\n", name); + return -EINVAL; + } + if (len <= 0) { + CERROR("missing name: %s\n", name); + return -EINVAL; + } + memcpy(&resname, name, len); + + /* Always use the same endianness for the resid */ + memset(res_id, 0, sizeof(*res_id)); + res_id->name[0] = cpu_to_le64(resname); + /* XXX: unfortunately, sptlprc and config llog share one lock */ + switch(type) { + case CONFIG_T_CONFIG: + case CONFIG_T_SPTLRPC: + resname = 0; + break; + case CONFIG_T_RECOVER: + resname = type; + break; + default: + LBUG(); + } + res_id->name[1] = cpu_to_le64(resname); + CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", name, + res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); + return 0; +} + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) +{ + /* fsname is at most 8 chars long, maybe contain "-". + * e.g. "lustre", "SUN-000" */ + return mgc_name2resid(fsname, strlen(fsname), res_id, type); +} +EXPORT_SYMBOL(mgc_fsname2resid); + +int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) +{ + char *name_end; + int len; + + /* logname consists of "fsname-nodetype". + * e.g. "lustre-MDT0001", "SUN-000-client" */ + name_end = strrchr(logname, '-'); + LASSERT(name_end); + len = name_end - logname; + return mgc_name2resid(logname, len, res_id, type); +} + +/********************** config llog list **********************/ +static LIST_HEAD(config_llog_list); +static DEFINE_SPINLOCK(config_list_lock); + +/* Take a reference to a config log */ +static int config_log_get(struct config_llog_data *cld) +{ + ENTRY; + atomic_inc(&cld->cld_refcount); + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + RETURN(0); +} + +/* Drop a reference to a config log. When no longer referenced, + we can free the config log data */ +static void config_log_put(struct config_llog_data *cld) +{ + ENTRY; + + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, + atomic_read(&cld->cld_refcount)); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { + list_del(&cld->cld_list_chain); + spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + if (cld->cld_recover) + config_log_put(cld->cld_recover); + if (cld->cld_sptlrpc) + config_log_put(cld->cld_sptlrpc); + if (cld_is_sptlrpc(cld)) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); + OBD_FREE(cld, sizeof(*cld) + strlen(cld->cld_logname) + 1); + } + + EXIT; +} + +/* Find a config log by name */ +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *found = NULL; + void * instance; + ENTRY; + + LASSERT(logname != NULL); + + instance = cfg ? cfg->cfg_instance : NULL; + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + /* check if instance equals */ + if (instance != cld->cld_cfg.cfg_instance) + continue; + + /* instance may be NULL, should check name */ + if (strcmp(logname, cld->cld_logname) == 0) { + found = cld; + break; + } + } + if (found) { + atomic_inc(&found->cld_refcount); + LASSERT(found->cld_stopping == 0 || cld_is_sptlrpc(found) == 0); + } + spin_unlock(&config_list_lock); + RETURN(found); +} + +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + int type, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld; + int rc; + ENTRY; + + CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, + cfg ? cfg->cfg_instance : 0); + + OBD_ALLOC(cld, sizeof(*cld) + strlen(logname) + 1); + if (!cld) + RETURN(ERR_PTR(-ENOMEM)); + + strcpy(cld->cld_logname, logname); + if (cfg) + cld->cld_cfg = *cfg; + else + cld->cld_cfg.cfg_callback = class_config_llog_handler; + mutex_init(&cld->cld_lock); + cld->cld_cfg.cfg_last_idx = 0; + cld->cld_cfg.cfg_flags = 0; + cld->cld_cfg.cfg_sb = sb; + cld->cld_type = type; + atomic_set(&cld->cld_refcount, 1); + + /* Keep the mgc around until we are done */ + cld->cld_mgcexp = class_export_get(obd->obd_self_export); + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_start(logname); + cld->cld_cfg.cfg_obdname = obd->obd_name; + } + + rc = mgc_logname2resid(logname, &cld->cld_resid, type); + + spin_lock(&config_list_lock); + list_add(&cld->cld_list_chain, &config_llog_list); + spin_unlock(&config_list_lock); + + if (rc) { + config_log_put(cld); + RETURN(ERR_PTR(rc)); + } + + if (cld_is_sptlrpc(cld)) { + rc = mgc_process_log(obd, cld); + if (rc && rc != -ENOENT) + CERROR("failed processing sptlrpc log: %d\n", rc); + } + + RETURN(cld); +} + +static struct config_llog_data *config_recover_log_add(struct obd_device *obd, + char *fsname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_instance lcfg = *cfg; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + char logname[32]; + + if (IS_OST(lsi)) + return NULL; + + /* for osp-on-ost, see lustre_start_osp() */ + if (IS_MDT(lsi) && lcfg.cfg_instance) + return NULL; + + /* we have to use different llog for clients and mdts for cmd + * where only clients are notified if one of cmd server restarts */ + LASSERT(strlen(fsname) < sizeof(logname) / 2); + strcpy(logname, fsname); + if (IS_SERVER(lsi)) { /* mdt */ + LASSERT(lcfg.cfg_instance == NULL); + lcfg.cfg_instance = sb; + strcat(logname, "-mdtir"); + } else { + LASSERT(lcfg.cfg_instance != NULL); + strcat(logname, "-cliir"); + } + + cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); + return cld; +} + + +/** Add this log to the list of active logs watched by an MGC. + * Active means we're watching for updates. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. + */ +static int config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_data *cld; + struct config_llog_data *sptlrpc_cld; + char seclogname[32]; + char *ptr; + ENTRY; + + CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); + + /* + * for each regular log, the depended sptlrpc log name is + * -sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(-EINVAL); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + sptlrpc_cld = config_log_find(seclogname, NULL); + if (sptlrpc_cld == NULL) { + sptlrpc_cld = do_config_log_add(obd, seclogname, + CONFIG_T_SPTLRPC, NULL, NULL); + if (IS_ERR(sptlrpc_cld)) { + CERROR("can't create sptlrpc log: %s\n", seclogname); + RETURN(PTR_ERR(sptlrpc_cld)); + } + } + + cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); + if (IS_ERR(cld)) { + CERROR("can't create log: %s\n", logname); + config_log_put(sptlrpc_cld); + RETURN(PTR_ERR(cld)); + } + + cld->cld_sptlrpc = sptlrpc_cld; + + LASSERT(lsi->lsi_lmd); + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) { + struct config_llog_data *recover_cld; + *strrchr(seclogname, '-') = 0; + recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); + if (IS_ERR(recover_cld)) { + config_log_put(cld); + RETURN(PTR_ERR(recover_cld)); + } + cld->cld_recover = recover_cld; + } + + RETURN(0); +} + +DEFINE_MUTEX(llog_process_lock); + +/** Stop watching for updates on this log. + */ +static int config_log_end(char *logname, struct config_llog_instance *cfg) +{ + struct config_llog_data *cld; + struct config_llog_data *cld_sptlrpc = NULL; + struct config_llog_data *cld_recover = NULL; + int rc = 0; + ENTRY; + + cld = config_log_find(logname, cfg); + if (cld == NULL) + RETURN(-ENOENT); + + mutex_lock(&cld->cld_lock); + /* + * if cld_stopping is set, it means we didn't start the log thus + * not owning the start ref. this can happen after previous umount: + * the cld still hanging there waiting for lock cancel, and we + * remount again but failed in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + mutex_unlock(&cld->cld_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } + + cld->cld_stopping = 1; + + cld_recover = cld->cld_recover; + cld->cld_recover = NULL; + mutex_unlock(&cld->cld_lock); + + if (cld_recover) { + mutex_lock(&cld_recover->cld_lock); + cld_recover->cld_stopping = 1; + mutex_unlock(&cld_recover->cld_lock); + config_log_put(cld_recover); + } + + spin_lock(&config_list_lock); + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + spin_unlock(&config_list_lock); + + if (cld_sptlrpc) + config_log_put(cld_sptlrpc); + + /* drop the ref from the find */ + config_log_put(cld); + /* drop the start ref */ + config_log_put(cld); + + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", + rc); + RETURN(rc); +} + +int lprocfs_mgc_rd_ir_state(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp = obd->u.cli.cl_import; + struct obd_connect_data *ocd = &imp->imp_connect_data; + struct config_llog_data *cld; + int rc = 0; + ENTRY; + + rc = snprintf(page, count, "imperative_recovery: %s\n", + OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED"); + rc += snprintf(page + rc, count - rc, "client_state:\n"); + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { + if (cld->cld_recover == NULL) + continue; + rc += snprintf(page + rc, count - rc, + " - { client: %s, nidtbl_version: %u }\n", + cld->cld_logname, + cld->cld_recover->cld_cfg.cfg_last_idx); + } + spin_unlock(&config_list_lock); + + RETURN(rc); +} + +/* reenqueue any lost locks */ +#define RQ_RUNNING 0x1 +#define RQ_NOW 0x2 +#define RQ_LATER 0x4 +#define RQ_STOP 0x8 +static int rq_state = 0; +static wait_queue_head_t rq_waitq; +static DECLARE_COMPLETION(rq_exit); + +static void do_requeue(struct config_llog_data *cld) +{ + ENTRY; + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Do not run mgc_process_log on a disconnected export or an + export which is being disconnected. Take the client + semaphore to make the check non-racy. */ + down_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); + + EXIT; +} + +/* this timeout represents how many seconds MGC should wait before + * requeue config and recover lock to the MGS. We need to randomize this + * in order to not flood the MGS. + */ +#define MGC_TIMEOUT_MIN_SECONDS 5 +#define MGC_TIMEOUT_RAND_CENTISEC 0x1ff /* ~500 */ + +static int mgc_requeue_thread(void *data) +{ + int rc = 0; + ENTRY; + + CDEBUG(D_MGC, "Starting requeue thread\n"); + + /* Keep trying failed locks periodically */ + spin_lock(&config_list_lock); + rq_state |= RQ_RUNNING; + while (1) { + struct l_wait_info lwi; + struct config_llog_data *cld, *cld_prev; + int rand = cfs_rand() & MGC_TIMEOUT_RAND_CENTISEC; + int stopped = !!(rq_state & RQ_STOP); + int to; + + /* Any new or requeued lostlocks will change the state */ + rq_state &= ~(RQ_NOW | RQ_LATER); + spin_unlock(&config_list_lock); + + /* Always wait a few seconds to allow the server who + caused the lock revocation to finish its setup, plus some + random so everyone doesn't try to reconnect at once. */ + to = MGC_TIMEOUT_MIN_SECONDS * HZ; + to += rand * HZ / 100; /* rand is centi-seconds */ + lwi = LWI_TIMEOUT(to, NULL, NULL); + l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi); + + /* + * iterate & processing through the list. for each cld, process + * its depending sptlrpc cld firstly (if any) and then itself. + * + * it's guaranteed any item in the list must have + * reference > 0; and if cld_lostlock is set, at + * least one reference is taken by the previous enqueue. + */ + cld_prev = NULL; + + spin_lock(&config_list_lock); + list_for_each_entry(cld, &config_llog_list, + cld_list_chain) { + if (!cld->cld_lostlock) + continue; + + spin_unlock(&config_list_lock); + + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* Whether we enqueued again or not in mgc_process_log, + * we're done with the ref from the old enqueue */ + if (cld_prev) + config_log_put(cld_prev); + cld_prev = cld; + + cld->cld_lostlock = 0; + if (likely(!stopped)) + do_requeue(cld); + + spin_lock(&config_list_lock); + } + spin_unlock(&config_list_lock); + if (cld_prev) + config_log_put(cld_prev); + + /* break after scanning the list so that we can drop + * refcount to losing lock clds */ + if (unlikely(stopped)) { + spin_lock(&config_list_lock); + break; + } + + /* Wait a bit to see if anyone else needs a requeue */ + lwi = (struct l_wait_info) { 0 }; + l_wait_event(rq_waitq, rq_state & (RQ_NOW | RQ_STOP), + &lwi); + spin_lock(&config_list_lock); + } + /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ + rq_state &= ~RQ_RUNNING; + spin_unlock(&config_list_lock); + + complete(&rq_exit); + + CDEBUG(D_MGC, "Ending requeue thread\n"); + RETURN(rc); +} + +/* Add a cld to the list to requeue. Start the requeue thread if needed. + We are responsible for dropping the config log reference from here on out. */ +static void mgc_requeue_add(struct config_llog_data *cld) +{ + ENTRY; + + CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", + cld->cld_logname, atomic_read(&cld->cld_refcount), + cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping || cld->cld_lostlock) { + mutex_unlock(&cld->cld_lock); + RETURN_EXIT; + } + /* this refcount will be released in mgc_requeue_thread. */ + config_log_get(cld); + cld->cld_lostlock = 1; + mutex_unlock(&cld->cld_lock); + + /* Hold lock for rq_state */ + spin_lock(&config_list_lock); + if (rq_state & RQ_STOP) { + spin_unlock(&config_list_lock); + cld->cld_lostlock = 0; + config_log_put(cld); + } else { + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + } + EXIT; +} + +/********************** class fns **********************/ + +static int mgc_fs_setup(struct obd_device *obd, struct super_block *sb, + struct vfsmount *mnt) +{ + struct lvfs_run_ctxt saved; + struct lustre_sb_info *lsi = s2lsi(sb); + struct client_obd *cli = &obd->u.cli; + struct dentry *dentry; + char *label; + int err = 0; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_srv_mnt == mnt); + + /* The mgc fs exclusion sem. Only one fs can be setup at a time. */ + down(&cli->cl_mgc_sem); + + cfs_cleanup_group_info(); + + obd->obd_fsops = fsfilt_get_ops(lsi->lsi_fstype); + if (IS_ERR(obd->obd_fsops)) { + up(&cli->cl_mgc_sem); + CERROR("%s: No fstype %s: rc = %ld\n", lsi->lsi_fstype, + obd->obd_name, PTR_ERR(obd->obd_fsops)); + RETURN(PTR_ERR(obd->obd_fsops)); + } + + cli->cl_mgc_vfsmnt = mnt; + err = fsfilt_setup(obd, mnt->mnt_sb); + if (err) + GOTO(err_ops, err); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.pwdmnt = mnt; + obd->obd_lvfs_ctxt.pwd = mnt->mnt_root; + obd->obd_lvfs_ctxt.fs = get_ds(); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs), + strlen(MOUNT_CONFIGS_DIR)); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + CERROR("cannot lookup %s directory: rc = %d\n", + MOUNT_CONFIGS_DIR, err); + GOTO(err_ops, err); + } + cli->cl_mgc_configs_dir = dentry; + + /* We take an obd ref to insure that we can't get to mgc_cleanup + without calling mgc_fs_cleanup first. */ + class_incref(obd, "mgc_fs", obd); + + label = fsfilt_get_label(obd, mnt->mnt_sb); + if (label) + CDEBUG(D_MGC, "MGC using disk labelled=%s\n", label); + + /* We keep the cl_mgc_sem until mgc_fs_cleanup */ + RETURN(0); + +err_ops: + fsfilt_put_ops(obd->obd_fsops); + obd->obd_fsops = NULL; + cli->cl_mgc_vfsmnt = NULL; + up(&cli->cl_mgc_sem); + RETURN(err); +} + +static int mgc_fs_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc = 0; + ENTRY; + + LASSERT(cli->cl_mgc_vfsmnt != NULL); + + if (cli->cl_mgc_configs_dir != NULL) { + struct lvfs_run_ctxt saved; + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + l_dput(cli->cl_mgc_configs_dir); + cli->cl_mgc_configs_dir = NULL; + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + class_decref(obd, "mgc_fs", obd); + } + + cli->cl_mgc_vfsmnt = NULL; + if (obd->obd_fsops) + fsfilt_put_ops(obd->obd_fsops); + + up(&cli->cl_mgc_sem); + + RETURN(rc); +} + +static atomic_t mgc_count = ATOMIC_INIT(0); +static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: + break; + case OBD_CLEANUP_EXPORTS: + if (atomic_dec_and_test(&mgc_count)) { + int running; + /* stop requeue thread */ + spin_lock(&config_list_lock); + running = rq_state & RQ_RUNNING; + if (running) + rq_state |= RQ_STOP; + spin_unlock(&config_list_lock); + if (running) { + wake_up(&rq_waitq); + wait_for_completion(&rq_exit); + } + } + obd_cleanup_client_import(obd); + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + RETURN(rc); +} + +static int mgc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + ENTRY; + + LASSERT(cli->cl_mgc_vfsmnt == NULL); + + /* COMPAT_146 - old config logs may have added profiles we don't + know about */ + if (obd->obd_type->typ_refcnt <= 1) + /* Only for the last mgc */ + class_del_profiles(); + + lprocfs_obd_cleanup(obd); + ptlrpcd_decref(); + + rc = client_obd_cleanup(obd); + RETURN(rc); +} + +static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars; + int rc; + ENTRY; + + ptlrpcd_addref(); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(err_decref, rc); + + rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL); + if (rc) { + CERROR("failed to setup llogging subsystems\n"); + GOTO(err_cleanup, rc); + } + + lprocfs_mgc_init_vars(&lvars); + lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); + + if (atomic_inc_return(&mgc_count) == 1) { + rq_state = 0; + init_waitqueue_head(&rq_waitq); + + /* start requeue thread */ + rc = PTR_ERR(kthread_run(mgc_requeue_thread, NULL, + "ll_cfg_requeue")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: Cannot start requeue thread (%d)," + "no more log updates!\n", + obd->obd_name, rc); + GOTO(err_cleanup, rc); + } + /* rc is the task_struct pointer of mgc_requeue_thread. */ + rc = 0; + } + + RETURN(rc); + +err_cleanup: + client_obd_cleanup(obd); +err_decref: + ptlrpcd_decref(); + RETURN(rc); +} + +/* based on ll_mdc_blocking_ast */ +static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct lustre_handle lockh; + struct config_llog_data *cld = (struct config_llog_data *)data; + int rc = 0; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + /* mgs wants the lock, give it up... */ + LDLM_DEBUG(lock, "MGC blocking CB"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); + break; + case LDLM_CB_CANCELING: + /* We've given up the lock, prepare ourselves to update. */ + LDLM_DEBUG(lock, "MGC cancel CB"); + + CDEBUG(D_MGC, "Lock res "LPX64" (%.8s)\n", + lock->l_resource->lr_name.name[0], + (char *)&lock->l_resource->lr_name.name[0]); + + if (!cld) { + CDEBUG(D_INFO, "missing data, won't requeue\n"); + break; + } + + /* held at mgc_process_log(). */ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + /* Are we done with this log? */ + if (cld->cld_stopping) { + CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + /* Make sure not to re-enqueue when the mgc is stopping + (we get called from client_disconnect_export) */ + if (!lock->l_conn_export || + !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) { + CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", + cld->cld_logname); + config_log_put(cld); + break; + } + + /* Re-enqueue now */ + mgc_requeue_add(cld); + config_log_put(cld); + break; + default: + LBUG(); + } + + RETURN(rc); +} + +/* Not sure where this should go... */ +#define MGC_ENQUEUE_LIMIT 50 +#define MGC_TARGET_REG_LIMIT 10 +#define MGC_SEND_PARAM_LIMIT 10 + +/* Send parameter to MGS*/ +static int mgc_set_mgs_param(struct obd_export *exp, + struct mgs_send_param *msp) +{ + struct ptlrpc_request *req; + struct mgs_send_param *req_msp, *rep_msp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, + MGS_SET_INFO); + if (!req) + RETURN(-ENOMEM); + + req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + if (!req_msp) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_msp, msp, sizeof(*req_msp)); + ptlrpc_request_set_replen(req); + + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); + memcpy(msp, rep_msp, sizeof(*rep_msp)); + } + + ptlrpc_req_finished(req); + + RETURN(rc); +} + +/* Take a config lock so we can get cancel notifications */ +static int mgc_enqueue(struct obd_export *exp, struct lov_stripe_md *lsm, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb, + void *data, __u32 lvb_len, void *lvb_swabber, + struct lustre_handle *lockh) +{ + struct config_llog_data *cld = (struct config_llog_data *)data; + struct ldlm_enqueue_info einfo = { type, mode, mgc_blocking_ast, + ldlm_completion_ast, NULL, NULL, NULL }; + struct ptlrpc_request *req; + int short_limit = cld_is_sptlrpc(cld); + int rc; + ENTRY; + + CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname, + cld->cld_resid.name[0]); + + /* We need a callback for every lockholder, so don't try to + ldlm_lock_match (see rev 1.1.2.11.2.47) */ + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, + LDLM_ENQUEUE); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); + ptlrpc_request_set_replen(req); + + /* check if this is server or client */ + if (cld->cld_cfg.cfg_sb) { + struct lustre_sb_info *lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (lsi && IS_SERVER(lsi)) + short_limit = 1; + } + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; + rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, + NULL, 0, LVB_T_NONE, lockh, 0); + /* A failed enqueue should still call the mgc_blocking_ast, + where it will be requeued if needed ("grant failed"). */ + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int mgc_cancel(struct obd_export *exp, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *lockh) +{ + ENTRY; + + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static void mgc_notify_active(struct obd_device *unused) +{ + /* wakeup mgc_requeue_thread to requeue mgc lock */ + spin_lock(&config_list_lock); + rq_state |= RQ_NOW; + spin_unlock(&config_list_lock); + wake_up(&rq_waitq); + + /* TODO: Help the MGS rebuild nidtbl. -jay */ +} + +/* Send target_reg message to MGS */ +static int mgc_target_register(struct obd_export *exp, + struct mgs_target_info *mti) +{ + struct ptlrpc_request *req; + struct mgs_target_info *req_mti, *rep_mti; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, + MGS_TARGET_REG); + if (req == NULL) + RETURN(-ENOMEM); + + req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); + if (!req_mti) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + + memcpy(req_mti, mti, sizeof(*req_mti)); + ptlrpc_request_set_replen(req); + CDEBUG(D_MGC, "register %s\n", mti->mti_svname); + /* Limit how long we will wait for the enqueue to complete */ + req->rq_delay_limit = MGC_TARGET_REG_LIMIT; + + rc = ptlrpc_queue_wait(req); + if (!rc) { + rep_mti = req_capsule_server_get(&req->rq_pill, + &RMF_MGS_TARGET_INFO); + memcpy(mti, rep_mti, sizeof(*rep_mti)); + CDEBUG(D_MGC, "register %s got index = %d\n", + mti->mti_svname, mti->mti_stripe_index); + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set) +{ + int rc = -EINVAL; + ENTRY; + + /* Turn off initial_recov after we try all backup servers once */ + if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { + struct obd_import *imp = class_exp2cliimp(exp); + int value; + if (vallen != sizeof(int)) + RETURN(-EINVAL); + value = *(int *)val; + CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", + imp->imp_obd->obd_name, value, + imp->imp_deactive, imp->imp_invalid, + imp->imp_replayable, imp->imp_obd->obd_replayable, + ptlrpc_import_state_name(imp->imp_state)); + /* Resurrect if we previously died */ + if ((imp->imp_state != LUSTRE_IMP_FULL && + imp->imp_state != LUSTRE_IMP_NEW) || value > 1) + ptlrpc_reconnect_import(imp); + RETURN(0); + } + /* FIXME move this to mgc_process_config */ + if (KEY_IS(KEY_REGISTER_TARGET)) { + struct mgs_target_info *mti; + if (vallen != sizeof(struct mgs_target_info)) + RETURN(-EINVAL); + mti = (struct mgs_target_info *)val; + CDEBUG(D_MGC, "register_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(exp, mti); + RETURN(rc); + } + if (KEY_IS(KEY_SET_FS)) { + struct super_block *sb = (struct super_block *)val; + struct lustre_sb_info *lsi; + if (vallen != sizeof(struct super_block)) + RETURN(-EINVAL); + lsi = s2lsi(sb); + rc = mgc_fs_setup(exp->exp_obd, sb, lsi->lsi_srv_mnt); + if (rc) { + CERROR("set_fs got %d\n", rc); + } + RETURN(rc); + } + if (KEY_IS(KEY_CLEAR_FS)) { + if (vallen != 0) + RETURN(-EINVAL); + rc = mgc_fs_cleanup(exp->exp_obd); + if (rc) { + CERROR("clear_fs got %d\n", rc); + } + RETURN(rc); + } + if (KEY_IS(KEY_SET_INFO)) { + struct mgs_send_param *msp; + + msp = (struct mgs_send_param *)val; + rc = mgc_set_mgs_param(exp, msp); + RETURN(rc); + } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * empty string means using current flavor, if which haven't + * been set yet, set it as null. + * + * if flavor has been set previously, check the asking flavor + * must match the existing one. + */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * caller already hold a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } + + RETURN(rc); +} + +static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *unused) +{ + int rc = -EINVAL; + + if (KEY_IS(KEY_CONN_DATA)) { + struct obd_import *imp = class_exp2cliimp(exp); + struct obd_connect_data *data = val; + + if (*vallen == sizeof(*data)) { + *data = imp->imp_connect_data; + rc = 0; + } + } + + return rc; +} + +static int mgc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + int rc = 0; + + LASSERT(imp->imp_obd == obd); + CDEBUG(D_MGC, "import event %#x\n", event); + + switch (event) { + case IMP_EVENT_DISCON: + /* MGC imports should not wait for recovery */ + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_down(); + break; + case IMP_EVENT_INACTIVE: + break; + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + break; + } + case IMP_EVENT_ACTIVE: + CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); + /* Clearing obd_no_recov allows us to continue pinging */ + obd->obd_no_recov = 0; + mgc_notify_active(obd); + if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) + ptlrpc_pinger_ir_up(); + break; + case IMP_EVENT_OCD: + break; + case IMP_EVENT_DEACTIVATE: + case IMP_EVENT_ACTIVATE: + break; + default: + CERROR("Unknown import event %#x\n", event); + LBUG(); + } + RETURN(rc); +} + +static int mgc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *tgt, int *index) +{ + struct llog_ctxt *ctxt; + int rc; + ENTRY; + + LASSERT(olg == &obd->obd_olg); + + + rc = llog_setup(NULL, obd, olg, LLOG_CONFIG_REPL_CTXT, tgt, + &llog_client_ops); + if (rc) + GOTO(out, rc); + + ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_REPL_CTXT); + if (!ctxt) + GOTO(out, rc = -ENODEV); + + llog_initiator_connect(ctxt); + llog_ctxt_put(ctxt); + + RETURN(0); +out: + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + RETURN(rc); +} + +static int mgc_llog_finish(struct obd_device *obd, int count) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + RETURN(0); +} + +enum { + CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT), + CONFIG_READ_NRPAGES = 4 +}; + +static int mgc_apply_recover_logs(struct obd_device *mgc, + struct config_llog_data *cld, + __u64 max_version, + void *data, int datalen, bool mne_swab) +{ + struct config_llog_instance *cfg = &cld->cld_cfg; + struct lustre_sb_info *lsi = s2lsi(cfg->cfg_sb); + struct mgs_nidtbl_entry *entry; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + u64 prev_version = 0; + char *inst; + char *buf; + int bufsz; + int pos; + int rc = 0; + int off = 0; + ENTRY; + + LASSERT(cfg->cfg_instance != NULL); + LASSERT(cfg->cfg_sb == cfg->cfg_instance); + + OBD_ALLOC(inst, PAGE_CACHE_SIZE); + if (inst == NULL) + RETURN(-ENOMEM); + + if (!IS_SERVER(lsi)) { + pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance); + if (pos >= PAGE_CACHE_SIZE) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + return -E2BIG; + } + } else { + LASSERT(IS_MDT(lsi)); + rc = server_name2svname(lsi->lsi_svname, inst, NULL, + PAGE_CACHE_SIZE); + if (rc) { + OBD_FREE(inst, PAGE_CACHE_SIZE); + RETURN(-EINVAL); + } + pos = strlen(inst); + } + + ++pos; + buf = inst + pos; + bufsz = PAGE_CACHE_SIZE - pos; + + while (datalen > 0) { + int entry_len = sizeof(*entry); + int is_ost; + struct obd_device *obd; + char *obdname; + char *cname; + char *params; + char *uuid; + + rc = -EINVAL; + if (datalen < sizeof(*entry)) + break; + + entry = (typeof(entry))(data + off); + + /* sanity check */ + if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ + break; + if (entry->mne_nid_count == 0) /* at least one nid entry */ + break; + if (entry->mne_nid_size != sizeof(lnet_nid_t)) + break; + + entry_len += entry->mne_nid_count * entry->mne_nid_size; + if (datalen < entry_len) /* must have entry_len at least */ + break; + + /* Keep this swab for normal mixed endian handling. LU-1644 */ + if (mne_swab) + lustre_swab_mgs_nidtbl_entry(entry); + if (entry->mne_length > PAGE_CACHE_SIZE) { + CERROR("MNE too large (%u)\n", entry->mne_length); + break; + } + + if (entry->mne_length < entry_len) + break; + + off += entry->mne_length; + datalen -= entry->mne_length; + if (datalen < 0) + break; + + if (entry->mne_version > max_version) { + CERROR("entry index(%lld) is over max_index(%lld)\n", + entry->mne_version, max_version); + break; + } + + if (prev_version >= entry->mne_version) { + CERROR("index unsorted, prev %lld, now %lld\n", + prev_version, entry->mne_version); + break; + } + prev_version = entry->mne_version; + + /* + * Write a string with format "nid::instance" to + * lustre//--/import. + */ + + is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; + memset(buf, 0, bufsz); + obdname = buf; + pos = 0; + + /* lustre-OST0001-osc- */ + strcpy(obdname, cld->cld_logname); + cname = strrchr(obdname, '-'); + if (cname == NULL) { + CERROR("mgc %s: invalid logname %s\n", + mgc->obd_name, obdname); + break; + } + + pos = cname - obdname; + obdname[pos] = 0; + pos += sprintf(obdname + pos, "-%s%04x", + is_ost ? "OST" : "MDT", entry->mne_index); + + cname = is_ost ? "osc" : "mdc", + pos += sprintf(obdname + pos, "-%s-%s", cname, inst); + lustre_cfg_bufs_reset(&bufs, obdname); + + /* find the obd by obdname */ + obd = class_name2obd(obdname); + if (obd == NULL) { + CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", + mgc->obd_name, obdname); + rc = 0; + /* this is a safe race, when the ost is starting up...*/ + continue; + } + + /* osc.import = "connection=::" */ + ++pos; + params = buf + pos; + pos += sprintf(params, "%s.import=%s", cname, "connection="); + uuid = buf + pos; + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + /* client does not connect to the OST yet */ + up_read(&obd->u.cli.cl_sem); + rc = 0; + continue; + } + + /* TODO: iterate all nids to find one */ + /* find uuid by nid */ + rc = client_import_find_conn(obd->u.cli.cl_import, + entry->u.nids[0], + (struct obd_uuid *)uuid); + up_read(&obd->u.cli.cl_sem); + if (rc < 0) { + CERROR("mgc: cannot find uuid by nid %s\n", + libcfs_nid2str(entry->u.nids[0])); + break; + } + + CDEBUG(D_INFO, "Find uuid %s by nid %s\n", + uuid, libcfs_nid2str(entry->u.nids[0])); + + pos += strlen(uuid); + pos += sprintf(buf + pos, "::%u", entry->mne_instance); + LASSERT(pos < bufsz); + + lustre_cfg_bufs_set_string(&bufs, 1, params); + + rc = -ENOMEM; + lcfg = lustre_cfg_new(LCFG_PARAM, &bufs); + if (lcfg == NULL) { + CERROR("mgc: cannot allocate memory\n"); + break; + } + + CDEBUG(D_INFO, "ir apply logs "LPD64"/"LPD64" for %s -> %s\n", + prev_version, max_version, obdname, params); + + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + if (rc) + CDEBUG(D_INFO, "process config for %s error %d\n", + obdname, rc); + + /* continue, even one with error */ + } + + OBD_FREE(inst, PAGE_CACHE_SIZE); + RETURN(rc); +} + +/** + * This function is called if this client was notified for target restarting + * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery logs. + */ +static int mgc_process_recover_log(struct obd_device *obd, + struct config_llog_data *cld) +{ + struct ptlrpc_request *req = NULL; + struct config_llog_instance *cfg = &cld->cld_cfg; + struct mgs_config_body *body; + struct mgs_config_res *res; + struct ptlrpc_bulk_desc *desc; + struct page **pages; + int nrpages; + bool eof = true; + bool mne_swab = false; + int i; + int ealen; + int rc; + ENTRY; + + /* allocate buffer for bulk transfer. + * if this is the first time for this mgs to read logs, + * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs + * once; otherwise, it only reads increment of logs, this should be + * small and CONFIG_READ_NRPAGES will be used. + */ + nrpages = CONFIG_READ_NRPAGES; + if (cfg->cfg_last_idx == 0) /* the first time */ + nrpages = CONFIG_READ_NRPAGES_INIT; + + OBD_ALLOC(pages, sizeof(*pages) * nrpages); + if (pages == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) { + pages[i] = alloc_page(GFP_IOFS); + if (pages[i] == NULL) + GOTO(out, rc = -ENOMEM); + } + +again: + LASSERT(cld_is_recover(cld)); + LASSERT(mutex_is_locked(&cld->cld_lock)); + req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), + &RQF_MGS_CONFIG_READ); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); + if (rc) + GOTO(out, rc); + + /* pack request */ + body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); + LASSERT(body != NULL); + LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); + if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) + >= sizeof(body->mcb_name)) + GOTO(out, rc = -E2BIG); + body->mcb_offset = cfg->cfg_last_idx + 1; + body->mcb_type = cld->cld_type; + body->mcb_bits = PAGE_CACHE_SHIFT; + body->mcb_units = nrpages; + + /* allocate bulk transfer descriptor */ + desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, BULK_PUT_SINK, + MGS_BULK_PORTAL); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < nrpages; i++) + ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); + if (res->mcr_size < res->mcr_offset) + GOTO(out, rc = -EINVAL); + + /* always update the index even though it might have errors with + * handling the recover logs */ + cfg->cfg_last_idx = res->mcr_offset; + eof = res->mcr_offset == res->mcr_size; + + CDEBUG(D_INFO, "Latest version "LPD64", more %d.\n", + res->mcr_offset, eof == false); + + ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); + if (ealen < 0) + GOTO(out, rc = ealen); + + if (ealen > nrpages << PAGE_CACHE_SHIFT) + GOTO(out, rc = -EINVAL); + + if (ealen == 0) { /* no logs transferred */ + if (!eof) + rc = -EINVAL; + GOTO(out, rc); + } + + mne_swab = !!ptlrpc_rep_need_swab(req); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* This import flag means the server did an extra swab of IR MNE + * records (fixed in LU-1252), reverse it here if needed. LU-1644 */ + if (unlikely(req->rq_import->imp_need_mne_swab)) + mne_swab = !mne_swab; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + for (i = 0; i < nrpages && ealen > 0; i++) { + int rc2; + void *ptr; + + ptr = kmap(pages[i]); + rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr, + min_t(int, ealen, PAGE_CACHE_SIZE), + mne_swab); + kunmap(pages[i]); + if (rc2 < 0) { + CWARN("Process recover log %s error %d\n", + cld->cld_logname, rc2); + break; + } + + ealen -= PAGE_CACHE_SIZE; + } + +out: + if (req) + ptlrpc_req_finished(req); + + if (rc == 0 && !eof) + goto again; + + if (pages) { + for (i = 0; i < nrpages; i++) { + if (pages[i] == NULL) + break; + __free_page(pages[i]); + } + OBD_FREE(pages, sizeof(*pages) * nrpages); + } + return rc; +} + + +/* local_only means it cannot get remote llogs */ +static int mgc_process_cfg_log(struct obd_device *mgc, + struct config_llog_data *cld, + int local_only) +{ + struct llog_ctxt *ctxt, *lctxt = NULL; + struct lvfs_run_ctxt *saved_ctxt; + struct lustre_sb_info *lsi = NULL; + int rc = 0, must_pop = 0; + bool sptlrpc_started = false; + + ENTRY; + + LASSERT(cld); + LASSERT(mutex_is_locked(&cld->cld_lock)); + + /* + * local copy of sptlrpc log is controlled elsewhere, don't try to + * read it up here. + */ + if (cld_is_sptlrpc(cld) && local_only) + RETURN(0); + + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); + + ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); + if (!ctxt) { + CERROR("missing llog context\n"); + RETURN(-EINVAL); + } + + OBD_ALLOC_PTR(saved_ctxt); + if (saved_ctxt == NULL) + RETURN(-ENOMEM); + + lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + + if (local_only) { /* no local log at client side */ + GOTO(out_pop, rc = -EIO); + } + + if (cld_is_sptlrpc(cld)) { + sptlrpc_conf_log_update_begin(cld->cld_logname); + sptlrpc_started = true; + } + + /* logname and instance info should be the same, so use our + copy of the instance for the update. The cfg_last_idx will + be updated here. */ + rc = class_config_parse_llog(NULL, ctxt, cld->cld_logname, + &cld->cld_cfg); + EXIT; + +out_pop: + llog_ctxt_put(ctxt); + if (lctxt) + llog_ctxt_put(lctxt); + if (must_pop) + pop_ctxt(saved_ctxt, &mgc->obd_lvfs_ctxt, NULL); + + OBD_FREE_PTR(saved_ctxt); + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel. + * the logname must be -sptlrpc + */ + if (sptlrpc_started) { + LASSERT(cld_is_sptlrpc(cld)); + sptlrpc_conf_log_update_end(cld->cld_logname); + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + } + + RETURN(rc); +} + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Copy the log locally before parsing it if appropriate (non-MGS server) + */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld) +{ + struct lustre_handle lockh = { 0 }; + __u64 flags = LDLM_FL_NO_LRU; + int rc = 0, rcl; + ENTRY; + + LASSERT(cld); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ + mutex_lock(&cld->cld_lock); + if (cld->cld_stopping) { + mutex_unlock(&cld->cld_lock); + RETURN(0); + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); + + CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname, + cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); + + /* Get the cfg lock on the llog */ + rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, + LCK_CR, &flags, NULL, NULL, NULL, + cld, 0, NULL, &lockh); + if (rcl == 0) { + /* Get the cld, it will be released in mgc_blocking_ast. */ + config_log_get(cld); + rc = ldlm_lock_set_data(&lockh, (void *)cld); + LASSERT(rc == 0); + } else { + CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl); + + /* mark cld_lostlock so that it will requeue + * after MGC becomes available. */ + cld->cld_lostlock = 1; + /* Get extra reference, it will be put in requeue thread */ + config_log_get(cld); + } + + + if (cld_is_recover(cld)) { + rc = 0; /* this is not a fatal error for recover log */ + if (rcl == 0) + rc = mgc_process_recover_log(mgc, cld); + } else { + rc = mgc_process_cfg_log(mgc, cld, rcl != 0); + } + + CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n", + mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc); + + mutex_unlock(&cld->cld_lock); + + /* Now drop the lock so MGS can revoke it */ + if (!rcl) { + rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL, + LCK_CR, &lockh); + if (rcl) + CERROR("Can't drop cfg lock: %d\n", rcl); + } + + RETURN(rc); +} + + +/** Called from lustre_process_log. + * LCFG_LOG_START gets the config log from the MGS, processes it to start + * any services, and adds it to the list logs to watch (follow). + */ +static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct config_llog_instance *cfg = NULL; + char *logname; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_LOV_ADD_OBD: { + /* Overloading this cfg command: register a new target */ + struct mgs_target_info *mti; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) != + sizeof(struct mgs_target_info)) + GOTO(out, rc = -EINVAL); + + mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); + CDEBUG(D_MGC, "add_target %s %#x\n", + mti->mti_svname, mti->mti_flags); + rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); + break; + } + case LCFG_LOV_DEL_OBD: + /* Unregister has no meaning at the moment. */ + CERROR("lov_del_obd unimplemented\n"); + rc = -ENOSYS; + break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } + case LCFG_LOG_START: { + struct config_llog_data *cld; + struct super_block *sb; + + logname = lustre_cfg_string(lcfg, 1); + cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); + sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); + + CDEBUG(D_MGC, "parse_log %s from %d\n", logname, + cfg->cfg_last_idx); + + /* We're only called through here on the initial mount */ + rc = config_log_add(obd, logname, cfg, sb); + if (rc) + break; + cld = config_log_find(logname, cfg); + if (cld == NULL) { + rc = -ENOENT; + break; + } + + /* COMPAT_146 */ + /* FIXME only set this for old logs! Right now this forces + us to always skip the "inside markers" check */ + cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; + + rc = mgc_process_log(obd, cld); + if (rc == 0 && cld->cld_recover != NULL) { + if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> + imp_connect_data, IMP_RECOV)) { + rc = mgc_process_log(obd, cld->cld_recover); + } else { + struct config_llog_data *cir = cld->cld_recover; + cld->cld_recover = NULL; + config_log_put(cir); + } + if (rc) + CERROR("Cannot process recover llog %d\n", rc); + } + config_log_put(cld); + + break; + } + case LCFG_LOG_END: { + logname = lustre_cfg_string(lcfg, 1); + + if (lcfg->lcfg_bufcount >= 2) + cfg = (struct config_llog_instance *)lustre_cfg_buf( + lcfg, 2); + rc = config_log_end(logname, cfg); + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + +struct obd_ops mgc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = mgc_setup, + .o_precleanup = mgc_precleanup, + .o_cleanup = mgc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_disconnect = client_disconnect_export, + //.o_enqueue = mgc_enqueue, + .o_cancel = mgc_cancel, + //.o_iocontrol = mgc_iocontrol, + .o_set_info_async = mgc_set_info_async, + .o_get_info = mgc_get_info, + .o_import_event = mgc_import_event, + .o_llog_init = mgc_llog_init, + .o_llog_finish = mgc_llog_finish, + .o_process_config = mgc_process_config, +}; + +int __init mgc_init(void) +{ + return class_register_type(&mgc_obd_ops, NULL, NULL, + LUSTRE_MGC_NAME, NULL); +} + +static void /*__exit*/ mgc_exit(void) +{ + class_unregister_type(LUSTRE_MGC_NAME); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Management Client"); +MODULE_LICENSE("GPL"); + +module_init(mgc_init); +module_exit(mgc_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile new file mode 100644 index 000000000000..d2763f3f83cd --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o + +obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ + llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ + genops.o uuid.o llog_ioctl.o lprocfs_status.o \ + lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \ + local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\ + mea.o lu_object.o dt_object.o capa.o cl_object.o \ + cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o \ + md_local_object.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c new file mode 100644 index 000000000000..c2a6702c9f2c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/acl.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#ifdef CONFIG_FS_POSIX_ACL + +#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION + +enum { + ES_UNK = 0, /* unknown stat */ + ES_UNC = 1, /* ACL entry is not changed */ + ES_MOD = 2, /* ACL entry is modified */ + ES_ADD = 3, /* ACL entry is added */ + ES_DEL = 4 /* ACL entry is deleted */ +}; + +static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); + d->e_stat = le32_to_cpu(s->e_stat); +} + +static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); + d->e_stat = cpu_to_le32(s->e_stat); +} + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} + + +/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */ +static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header, + int old_count, int new_count) +{ + int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr); + int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr); + posix_acl_xattr_header *new; + + if (unlikely(old_count <= new_count)) + return old_size; + + OBD_ALLOC(new, new_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, new_size); + OBD_FREE(*header, old_size); + *header = new; + return new_size; +} + +/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */ +static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header, + int old_count) +{ + int ext_count = le32_to_cpu((*header)->a_count); + int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr); + ext_acl_xattr_header *new; + + if (unlikely(old_count <= ext_count)) + return 0; + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, ext_size); + OBD_FREE(*header, old_size); + *header = new; + return 0; +} + +/* + * Generate new extended ACL based on the posix ACL. + */ +ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size) +{ + int count, i, esize; + ext_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + count = 0; + else + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr); + OBD_ALLOC(new, esize); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + new->a_count = cpu_to_le32(count); + for (i = 0; i < count; i++) { + new->a_entries[i].e_tag = header->a_entries[i].e_tag; + new->a_entries[i].e_perm = header->a_entries[i].e_perm; + new->a_entries[i].e_id = header->a_entries[i].e_id; + new->a_entries[i].e_stat = cpu_to_le32(ES_UNK); + } + + RETURN(new); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext); + +/* + * Filter out the "nobody" entries in the posix ACL. + */ +int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size, + posix_acl_xattr_header **out) +{ + int count, i, j, rc = 0; + __u32 id; + posix_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + RETURN(0); + + OBD_ALLOC(new, size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + for (i = 0, j = 0; i < count; i++) { + id = le32_to_cpu(header->a_entries[i].e_id); + switch (le16_to_cpu(header->a_entries[i].e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + memcpy(&new->a_entries[j++], &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_USER: + if (id != NOBODY_UID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_GROUP: + if (id != NOBODY_GID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + default: + GOTO(_out, rc = -EIO); + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, count, j); + if (rc >= 0) { + size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, size); + size = rc; + } + return size; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_filter); + +/* + * Release the posix ACL space. + */ +void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size) +{ + OBD_FREE(header, size); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_free); + +/* + * Release the extended ACL space. + */ +void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header) +{ + OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \ + ext_acl_xattr)); +} +EXPORT_SYMBOL(lustre_ext_acl_xattr_free); + +static ext_acl_xattr_entry * +lustre_ext_acl_xattr_search(ext_acl_xattr_header *header, + posix_acl_xattr_entry *entry, int *pos) +{ + int once, start, end, i, j, count = le32_to_cpu(header->a_count); + + once = 0; + start = *pos; + end = count; + +again: + for (i = start; i < end; i++) { + if (header->a_entries[i].e_tag == entry->e_tag && + header->a_entries[i].e_id == entry->e_id) { + j = i; + if (++i >= count) + i = 0; + *pos = i; + return &header->a_entries[j]; + } + } + + if (!once) { + once = 1; + start = 0; + end = *pos; + goto again; + } + + return NULL; +} + +/* + * Merge the posix ACL and the extended ACL into new posix ACL. + */ +int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out) +{ + int posix_count, posix_size, i, j; + int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0; + posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID}; + posix_acl_xattr_header *new; + ext_acl_xattr_entry *ee, ae; + ENTRY; + + lustre_posix_acl_cpu_to_le(&pe, &pe); + ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos); + if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) { + /* there are only base ACL entries at most. */ + posix_count = 3; + posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + for (i = 0, j = 0; i < ext_count; i++) { + lustre_ext_acl_le_to_cpu(&ae, + &ext_header->a_entries[i]); + switch (ae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (ae.e_id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + if (ae.e_stat != ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + break; + case ACL_MASK: + case ACL_USER: + case ACL_GROUP: + if (ae.e_stat == ES_DEL) + break; + default: + GOTO(_out, rc = -EIO); + } + } + } else { + /* maybe there are valid ACL_USER or ACL_GROUP entries in the + * original server-side ACL, they are regarded as ES_UNC stat.*/ + int ori_posix_count; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + ori_posix_count = 0; + else + ori_posix_count = + CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + posix_count = ori_posix_count + ext_count; + posix_size = + CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + /* 1. process the unchanged ACL entries + * in the original server-side ACL. */ + pos = 0; + for (i = 0, j = 0; i < ori_posix_count; i++) { + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee == NULL) + memcpy(&new->a_entries[j++], + &posix_header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + } + + /* 2. process the non-deleted entries + * from client-side extended ACL. */ + for (i = 0; i < ext_count; i++) { + if (le16_to_cpu(ext_header->a_entries[i].e_stat) != + ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j); + if (rc >= 0) { + posix_size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, posix_size); + posix_size = rc; + } + return posix_size; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2posix); + +/* + * Merge the posix ACL and the extended ACL into new extended ACL. + */ +ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header) +{ + int ori_ext_count, posix_count, ext_count, ext_size; + int i, j, pos = 0, rc = 0; + posix_acl_xattr_entry pae; + ext_acl_xattr_header *new; + ext_acl_xattr_entry *ee, eae; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + posix_count = 0; + else + posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + ori_ext_count = le32_to_cpu(ext_header->a_count); + ext_count = posix_count + ori_ext_count; + ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + for (i = 0, j = 0; i < posix_count; i++) { + lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]); + switch (pae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (pae.e_id != ACL_UNDEFINED_ID) + GOTO(out, rc = -EIO); + case ACL_USER: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_UID) + break; + + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + case ACL_GROUP: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_GID) + break; + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + default: + GOTO(out, rc = -EIO); + } + } + + /* process deleted entries. */ + for (i = 0; i < ori_ext_count; i++) { + lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]); + if (eae.e_stat == ES_UNK) { + /* ignore "nobody" entry. */ + if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) || + (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID)) + continue; + + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j].e_id = ext_header->a_entries[i].e_id; + new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL); + } + } + + new->a_count = cpu_to_le32(j); + /* free unused space. */ + rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count); + EXIT; + +out: + if (rc) { + OBD_FREE(new, ext_size); + new = ERR_PTR(rc); + } + return new; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2ext); + +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c new file mode 100644 index 000000000000..3e532f5106e4 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/capa.c @@ -0,0 +1,401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/capa.c + * + * Lustre Capability Hash Management + * + * Author: Lai Siyao + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#define NR_CAPAHASH 32 +#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */ + +struct kmem_cache *capa_cachep = NULL; + +/* lock for capa hash/capa_list/fo_capa_keys */ +DEFINE_SPINLOCK(capa_lock); + +struct list_head capa_list[CAPA_SITE_MAX]; + +static struct capa_hmac_alg capa_hmac_algs[] = { + DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20), +}; +/* capa count */ +int capa_count[CAPA_SITE_MAX] = { 0, }; + +EXPORT_SYMBOL(capa_cachep); +EXPORT_SYMBOL(capa_list); +EXPORT_SYMBOL(capa_lock); +EXPORT_SYMBOL(capa_count); + +struct hlist_head *init_capa_hash(void) +{ + struct hlist_head *hash; + int nr_hash, i; + + OBD_ALLOC(hash, PAGE_CACHE_SIZE); + if (!hash) + return NULL; + + nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head); + LASSERT(nr_hash > NR_CAPAHASH); + + for (i = 0; i < NR_CAPAHASH; i++) + INIT_HLIST_HEAD(hash + i); + return hash; +} +EXPORT_SYMBOL(init_capa_hash); + +static inline int capa_on_server(struct obd_capa *ocapa) +{ + return ocapa->c_site == CAPA_SITE_SERVER; +} + +static inline void capa_delete(struct obd_capa *ocapa) +{ + LASSERT(capa_on_server(ocapa)); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); + capa_count[ocapa->c_site]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +void cleanup_capa_hash(struct hlist_head *hash) +{ + int i; + struct hlist_node *next; + struct obd_capa *oc; + + spin_lock(&capa_lock); + for (i = 0; i < NR_CAPAHASH; i++) { + hlist_for_each_entry_safe(oc, next, hash + i, + u.tgt.c_hash) + capa_delete(oc); + } + spin_unlock(&capa_lock); + + OBD_FREE(hash, PAGE_CACHE_SIZE); +} +EXPORT_SYMBOL(cleanup_capa_hash); + +static inline int capa_hashfn(struct lu_fid *fid) +{ + return (fid_oid(fid) ^ fid_ver(fid)) * + (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH; +} + +/* capa renewal time check is earlier than that on client, which is to prevent + * client renew right after obtaining it. */ +static inline int capa_is_to_expire(struct obd_capa *oc) +{ + return cfs_time_before(cfs_time_sub(oc->c_expiry, + cfs_time_seconds(oc->c_capa.lc_timeout)*2/3), + cfs_time_current()); +} + +static struct obd_capa *find_capa(struct lustre_capa *capa, + struct hlist_head *head, int alive) +{ + struct obd_capa *ocapa; + int len = alive ? offsetof(struct lustre_capa, lc_keyid):sizeof(*capa); + + hlist_for_each_entry(ocapa, head, u.tgt.c_hash) { + if (memcmp(&ocapa->c_capa, capa, len)) + continue; + /* don't return one that will expire soon in this case */ + if (alive && capa_is_to_expire(ocapa)) + continue; + + LASSERT(capa_on_server(ocapa)); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found"); + return ocapa; + } + + return NULL; +} + +#define LRU_CAPA_DELETE_COUNT 12 +static inline void capa_delete_lru(struct list_head *head) +{ + struct obd_capa *ocapa; + struct list_head *node = head->next; + int count = 0; + + /* free LRU_CAPA_DELETE_COUNT unused capa from head */ + while (count++ < LRU_CAPA_DELETE_COUNT) { + ocapa = list_entry(node, struct obd_capa, c_list); + node = node->next; + if (atomic_read(&ocapa->c_refc)) + continue; + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru"); + capa_delete(ocapa); + } +} + +/* add or update */ +struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) +{ + struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid); + struct obd_capa *ocapa, *old = NULL; + struct list_head *list = &capa_list[CAPA_SITE_SERVER]; + + ocapa = alloc_capa(CAPA_SITE_SERVER); + if (IS_ERR(ocapa)) + return NULL; + + spin_lock(&capa_lock); + old = find_capa(capa, head, 0); + if (!old) { + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + hlist_add_head(&ocapa->u.tgt.c_hash, head); + list_add_tail(&ocapa->c_list, list); + capa_get(ocapa); + capa_count[CAPA_SITE_SERVER]++; + if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) + capa_delete_lru(list); + spin_unlock(&capa_lock); + return ocapa; + } else { + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; + } +} +EXPORT_SYMBOL(capa_add); + +struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, + int alive) +{ + struct obd_capa *ocapa; + + spin_lock(&capa_lock); + ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive); + if (ocapa) { + list_move_tail(&ocapa->c_list, + &capa_list[CAPA_SITE_SERVER]); + capa_get(ocapa); + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(capa_lookup); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) +{ + struct ll_crypto_hash *tfm; + struct capa_hmac_alg *alg; + int keylen; + struct scatterlist sl; + + if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) { + CERROR("unknown capability hmac algorithm!\n"); + return -EFAULT; + } + + alg = &capa_hmac_algs[capa_alg(capa)]; + + tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0); + if (!tfm) { + CERROR("crypto_alloc_tfm failed, check whether your kernel" + "has crypto support!\n"); + return -ENOMEM; + } + keylen = alg->ha_keylen; + + sg_set_page(&sl, virt_to_page(capa), + offsetof(struct lustre_capa, lc_hmac), + (unsigned long)(capa) % PAGE_CACHE_SIZE); + + ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac); + ll_crypto_free_hash(tfm); + + return 0; +} +EXPORT_SYMBOL(capa_hmac); + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_encrypt_id); + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_decrypt_id); + +void capa_cpy(void *capa, struct obd_capa *ocapa) +{ + spin_lock(&ocapa->c_lock); + *(struct lustre_capa *)capa = ocapa->c_capa; + spin_unlock(&ocapa->c_lock); +} +EXPORT_SYMBOL(capa_cpy); + +void _debug_capa(struct lustre_capa *c, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ... ) +{ + va_list args; + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " capability@%p fid "DFID" opc "LPX64" uid "LPU64 + " gid "LPU64" flags %u alg %d keyid %u timeout %u " + "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c), + capa_uid(c), capa_gid(c), capa_flags(c), + capa_alg(c), capa_keyid(c), capa_timeout(c), + capa_expiry(c)); + va_end(args); +} +EXPORT_SYMBOL(_debug_capa); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h new file mode 100644 index 000000000000..7eb0ad7b3644 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Possible levels of the nesting. Currently this is 2: there are "top" + * entities (files, extent locks), and "sub" entities (stripes and stripe + * locks). This is used only for debugging counters right now. + */ +enum clt_nesting_level { + CNL_TOP, + CNL_SUB, + CNL_NR +}; + +/** + * Counters used to check correctness of cl_lock interface usage. + */ +struct cl_thread_counters { + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int ctc_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref ctc_locks_locked; + /** Number of outstanding holds on locks. */ + int ctc_nr_held; + /** Number of outstanding uses on locks. */ + int ctc_nr_used; + /** Number of held extent locks. */ + int ctc_nr_locks_acquired; +}; + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * Counters for every level of lock nesting. + */ + struct cl_thread_counters clt_counters[CNL_NR]; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. + */ + struct cl_io *clt_current_io; + /** + * Used for submitting a sync io. + */ + struct cl_sync_io clt_anchor; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t clt_next_index; + pgoff_t clt_fn_index; /* first non-overlapped index */ +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c new file mode 100644 index 000000000000..75c9be8875e0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -0,0 +1,1753 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. + */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. + */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + break; + case CIT_FAULT: + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. + * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u ["LPU64", "LPU64") %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + int ret; + + ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)); + if (ret) + return ret; + if (d0->cld_end < d1->cld_start) + return -1; + if (d0->cld_start > d0->cld_end) + return 1; + return 0; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. */ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, + &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +/** + * Check whether \a queue contains locks matching \a need. + * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + RETURN(+1); + } + RETURN(0); +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_queue_merge(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_cmp(&scan->cill_descr, need)) + continue; + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); + +} + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_merge(&set->cls_todo, need) || + cl_lockset_match(set, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + ENTRY; + + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + RETURN(result); +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + ENTRY; + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); + EXIT; +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + else + break; + } + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired); + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. */ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result = 1; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + if (!cl_io_is_append(io)) { + const struct cl_io_rw_common *crw = &(io->u.ci_rw); + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = crw->crw_pos < end && + start < crw->crw_pos + crw->crw_count; + } + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. + * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. + * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. + */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. + * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(anchor, +1); + } + + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor, timeout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page trasmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == current); + + ENTRY; + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, + page->cp_queue_ref, "queue", src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del(&page->cp_reference, "queue", plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + pgoff_t index = 0; + int result; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(index <= page->cp_index); + index = page->cp_index; + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == current); + ENTRY; + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. + */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + ENTRY; + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; + EXIT; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + ENTRY; + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]); + } + OBD_FREE_PTR(req); + EXIT; +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + ENTRY; + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + RETURN(result); +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. + */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + ENTRY; + /* + * for the lack of list_for_each_entry_reverse_safe()... + */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); + EXIT; +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + ENTRY; + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + req->crq_type = crt; + INIT_LIST_HEAD(&req->crq_pages); + INIT_LIST_HEAD(&req->crq_layers); + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + RETURN(req); +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. + */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + ENTRY; + page = cl_page_top(page); + + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n", + req, req->crq_type, req->crq_nrpages); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); + EXIT; +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. + */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + ENTRY; + page = cl_page_top(page); + + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; + EXIT; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + ENTRY; + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + ENTRY; + + /* Take any page to use as a model. */ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +# include + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + ENTRY; + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nrpages); + atomic_set(&anchor->csi_barrier, nrpages > 0); + anchor->csi_sync_rc = 0; + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. + */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + ENTRY; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("SYNC IO failed with error: %d, try to cancel " + "%d remaining pages\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + (void)cl_io_cancel(env, io, queue); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + + POISON(anchor, 0x5a, sizeof *anchor); + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + wake_up_all(&anchor->csi_waitq); + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c new file mode 100644 index 000000000000..d34e044fc854 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c @@ -0,0 +1,2304 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include "cl_internal.h" + +/** Lock class of cl_lock::cll_guard */ +static struct lock_class_key cl_lock_guard_class; +static struct kmem_cache *cl_lock_kmem; + +static struct lu_kmem_descr cl_lock_caches[] = { + { + .ckd_cache = &cl_lock_kmem, + .ckd_name = "cl_lock_kmem", + .ckd_size = sizeof (struct cl_lock) + }, + { + .ckd_cache = NULL + } +}; + +#define CS_LOCK_INC(o, item) +#define CS_LOCK_DEC(o, item) +#define CS_LOCKSTATE_INC(o, state) +#define CS_LOCKSTATE_DEC(o, state) + +/** + * Basic lock invariant that is maintained at all times. Caller either has a + * reference to \a lock, or somehow assures that \a lock cannot be freed. + * + * \see cl_lock_invariant() + */ +static int cl_lock_invariant_trusted(const struct lu_env *env, + const struct cl_lock *lock) +{ + return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) && + atomic_read(&lock->cll_ref) >= lock->cll_holds && + lock->cll_holds >= lock->cll_users && + lock->cll_holds >= 0 && + lock->cll_users >= 0 && + lock->cll_depth >= 0; +} + +/** + * Stronger lock invariant, checking that caller has a reference on a lock. + * + * \see cl_lock_invariant_trusted() + */ +static int cl_lock_invariant(const struct lu_env *env, + const struct cl_lock *lock) +{ + int result; + + result = atomic_read(&lock->cll_ref) > 0 && + cl_lock_invariant_trusted(env, lock); + if (!result && env != NULL) + CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken"); + return result; +} + +/** + * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock. + */ +static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock) +{ + return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting; +} + +/** + * Returns a set of counters for this lock, depending on a lock nesting. + */ +static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env, + const struct cl_lock *lock) +{ + struct cl_thread_info *info; + enum clt_nesting_level nesting; + + info = cl_env_info(env); + nesting = cl_lock_nesting(lock); + LASSERT(nesting < ARRAY_SIZE(info->clt_counters)); + return &info->clt_counters[nesting]; +} + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)" + "(%p/%d/%d) at %s():%d\n", + prefix, lock, atomic_read(&lock->cll_ref), + lock->cll_guarder, lock->cll_depth, + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags, + env, h->coh_nesting, cl_lock_nr_mutexed(env), + func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; + lock_map_acquire(&lock->dep_map); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || + need == CLM_PHANTOM || need == CLM_GROUP); + LINVRNT(has == CLM_READ || has == CLM_WRITE || + has == CLM_PHANTOM || has == CLM_GROUP); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + CLASSERT(CLM_WRITE < CLM_GROUP); + + if (has != CLM_GROUP) + return need <= has; + else + return need == has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. + */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode) && + (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. + */ +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + cl_object_same(has->cld_obj, need->cld_obj) && + cl_lock_ext_match(has, need); +} +EXPORT_SYMBOL(cl_lock_descr_match); + +static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + + LINVRNT(!cl_lock_is_mutexed(lock)); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "free lock", lock); + might_sleep(); + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, + struct cl_lock_slice, cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + CS_LOCK_DEC(obj, total); + CS_LOCKSTATE_DEC(obj, lock->cll_state); + lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock); + cl_object_put(env, obj); + lu_ref_fini(&lock->cll_reference); + lu_ref_fini(&lock->cll_holders); + mutex_destroy(&lock->cll_guard); + OBD_SLAB_FREE_PTR(lock, cl_lock_kmem); + EXIT; +} + +/** + * Releases a reference on a lock. + * + * When last reference is released, lock is returned to the cache, unless it + * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed + * immediately. + * + * \see cl_object_put(), cl_page_put() + */ +void cl_lock_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + obj = lock->cll_descr.cld_obj; + LINVRNT(obj != NULL); + + CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + + if (atomic_dec_and_test(&lock->cll_ref)) { + if (lock->cll_state == CLS_FREEING) { + LASSERT(list_empty(&lock->cll_linkage)); + cl_lock_free(env, lock); + } + CS_LOCK_DEC(obj, busy); + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_put); + +/** + * Acquires an additional reference to a lock. + * + * This can be called only by caller already possessing a reference to \a + * lock. + * + * \see cl_object_get(), cl_page_get() + */ +void cl_lock_get(struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(NULL, lock)); + CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + atomic_inc(&lock->cll_ref); +} +EXPORT_SYMBOL(cl_lock_get); + +/** + * Acquires a reference to a lock. + * + * This is much like cl_lock_get(), except that this function can be used to + * acquire initial reference to the cached lock. Caller has to deal with all + * possible races. Use with care! + * + * \see cl_page_get_trust() + */ +void cl_lock_get_trust(struct cl_lock *lock) +{ + CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + if (atomic_inc_return(&lock->cll_ref) == 1) + CS_LOCK_INC(lock->cll_descr.cld_obj, busy); +} +EXPORT_SYMBOL(cl_lock_get_trust); + +/** + * Helper function destroying the lock that wasn't completely initialized. + * + * Other threads can acquire references to the top-lock through its + * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately. + */ +static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); +} + +static struct cl_lock *cl_lock_alloc(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *descr) +{ + struct cl_lock *lock; + struct lu_object_header *head; + + ENTRY; + OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO); + if (lock != NULL) { + atomic_set(&lock->cll_ref, 1); + lock->cll_descr = *descr; + lock->cll_state = CLS_NEW; + cl_object_get(obj); + lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_lock", lock); + INIT_LIST_HEAD(&lock->cll_layers); + INIT_LIST_HEAD(&lock->cll_linkage); + INIT_LIST_HEAD(&lock->cll_inclosure); + lu_ref_init(&lock->cll_reference); + lu_ref_init(&lock->cll_holders); + mutex_init(&lock->cll_guard); + lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class); + init_waitqueue_head(&lock->cll_wq); + head = obj->co_lu.lo_header; + CS_LOCKSTATE_INC(obj, CLS_NEW); + CS_LOCK_INC(obj, total); + CS_LOCK_INC(obj, create); + cl_lock_lockdep_init(lock); + list_for_each_entry(obj, &head->loh_layers, + co_lu.lo_linkage) { + int err; + + err = obj->co_ops->coo_lock_init(env, obj, lock, io); + if (err != 0) { + cl_lock_finish(env, lock); + lock = ERR_PTR(err); + break; + } + } + } else + lock = ERR_PTR(-ENOMEM); + RETURN(lock); +} + +/** + * Transfer the lock into INTRANSIT state and return the original state. + * + * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED + * \post state: CLS_INTRANSIT + * \see CLS_INTRANSIT + */ +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock) +{ + enum cl_lock_state state = lock->cll_state; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(state != CLS_INTRANSIT); + LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED, + "Malformed lock state %d.\n", state); + + cl_lock_state_set(env, lock, CLS_INTRANSIT); + lock->cll_intransit_owner = current; + cl_lock_hold_add(env, lock, "intransit", current); + return state; +} +EXPORT_SYMBOL(cl_lock_intransit); + +/** + * Exit the intransit state and restore the lock state to the original state + */ +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(state != CLS_INTRANSIT); + LASSERT(lock->cll_intransit_owner == current); + + lock->cll_intransit_owner = NULL; + cl_lock_state_set(env, lock, state); + cl_lock_unhold(env, lock, "intransit", current); +} +EXPORT_SYMBOL(cl_lock_extransit); + +/** + * Checking whether the lock is intransit state + */ +int cl_lock_is_intransit(struct cl_lock *lock) +{ + LASSERT(cl_lock_is_mutexed(lock)); + return lock->cll_state == CLS_INTRANSIT && + lock->cll_intransit_owner != current; +} +EXPORT_SYMBOL(cl_lock_is_intransit); +/** + * Returns true iff lock is "suitable" for given io. E.g., locks acquired by + * truncate and O_APPEND cannot be reused for read/non-append-write, as they + * cover multiple stripes and can trigger cascading timeouts. + */ +static int cl_lock_fits_into(const struct lu_env *env, + const struct cl_lock *lock, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + ENTRY; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_fits_into != NULL && + !slice->cls_ops->clo_fits_into(env, slice, need, io)) + RETURN(0); + } + RETURN(1); +} + +static struct cl_lock *cl_lock_lookup(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_lock *lock; + struct cl_object_header *head; + + ENTRY; + + head = cl_object_header(obj); + LINVRNT(spin_is_locked(&head->coh_lock_guard)); + CS_LOCK_INC(obj, lookup); + list_for_each_entry(lock, &head->coh_locks, cll_linkage) { + int matched; + + matched = cl_lock_ext_match(&lock->cll_descr, need) && + lock->cll_state < CLS_FREEING && + lock->cll_error == 0 && + !(lock->cll_flags & CLF_CANCELLED) && + cl_lock_fits_into(env, lock, need, io); + CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n", + PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need), + matched); + if (matched) { + cl_lock_get_trust(lock); + CS_LOCK_INC(obj, hit); + RETURN(lock); + } + } + RETURN(NULL); +} + +/** + * Returns a lock matching description \a need. + * + * This is the main entry point into the cl_lock caching interface. First, a + * cache (implemented as a per-object linked list) is consulted. If lock is + * found there, it is returned immediately. Otherwise new lock is allocated + * and returned. In any case, additional reference to lock is acquired. + * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + ENTRY; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + list_add_tail(&lock->cll_linkage, + &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + CS_LOCK_INC(obj, busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + RETURN(lock); +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + do { + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + if (lock == NULL) + return NULL; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_INTRANSIT) + /* Don't care return value. */ + cl_lock_state_wait(env, lock); + if (lock->cll_state == CLS_FREEING) { + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + } while (lock == NULL); + + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock, 1); + if (lock->cll_state == CLS_HELD) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, 0); + cl_lock_put(env, lock); + } else { + cl_unuse_try(env, lock); + cl_lock_unhold(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + counters = cl_lock_counters(env, lock); + lock->cll_depth++; + counters->ctc_nr_locks_locked++; + lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock); + cl_lock_trace(D_TRACE, env, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. + * + * \post cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_put() + */ +void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_guarder == current) { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_depth > 0); + } else { + struct cl_object_header *hdr; + struct cl_thread_info *info; + int i; + + LINVRNT(lock->cll_guarder != current); + hdr = cl_object_header(lock->cll_descr.cld_obj); + /* + * Check that mutices are taken in the bottom-to-top order. + */ + info = cl_env_info(env); + for (i = 0; i < hdr->coh_nesting; ++i) + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting); + lock->cll_guarder = current; + LINVRNT(lock->cll_depth == 0); + } + cl_lock_mutex_tail(env, lock); +} +EXPORT_SYMBOL(cl_lock_mutex_get); + +/** + * Try-locks cl_lock object. + * + * \retval 0 \a lock was successfully locked + * + * \retval -EBUSY \a lock cannot be locked right now + * + * \post ergo(result == 0, cl_lock_is_mutexed(lock)) + * + * \see cl_lock_mutex_get() + */ +int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + ENTRY; + + result = 0; + if (lock->cll_guarder == current) { + LINVRNT(lock->cll_depth > 0); + cl_lock_mutex_tail(env, lock); + } else if (mutex_trylock(&lock->cll_guard)) { + LINVRNT(lock->cll_depth == 0); + lock->cll_guarder = current; + cl_lock_mutex_tail(env, lock); + } else + result = -EBUSY; + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_mutex_try); + +/** + {* Unlocks cl_lock object. + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == current); + LINVRNT(lock->cll_depth > 0); + + counters = cl_lock_counters(env, lock); + LINVRNT(counters->ctc_nr_locks_locked > 0); + + cl_lock_trace(D_TRACE, env, "put mutex", lock); + lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock); + counters->ctc_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == current; +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + struct cl_thread_info *info; + int i; + int locked; + + /* + * NOTE: if summation across all nesting levels (currently 2) proves + * too expensive, a summary counter can be added to + * struct cl_thread_info. + */ + info = cl_env_info(env); + for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + locked += info->clt_counters[i].ctc_nr_locks_locked; + return locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } + EXIT; +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + list_del_init(&lock->cll_linkage); + spin_unlock(&head->coh_lock_guard); + + /* + * From now on, no new references to this lock can be acquired + * by cl_lock_lookup(). + */ + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_delete != NULL) + slice->cls_ops->clo_delete(env, slice); + } + /* + * From now on, no new references to this lock can be acquired + * by layer-specific means (like a pointer from struct + * ldlm_lock in osc, or a pointer from top-lock to sub-lock in + * lov). + * + * Lock will be finally freed in cl_lock_put() when last of + * existing references goes away. + */ + } + EXIT; +} + +/** + * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a + * top-lock (nesting == 0) accounts for this modification in the per-thread + * debugging counters. Sub-lock holds can be released by a thread different + * from one that acquired it. + */ +static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_holds += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_held += delta; + LASSERT(counters->ctc_nr_held >= 0); + } +} + +/** + * Mod(ifie)s cl_lock::cll_users counter for a given lock. See + * cl_lock_hold_mod() for the explanation of the debugging code. + */ +static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock, + int delta) +{ + struct cl_thread_counters *counters; + enum clt_nesting_level nesting; + + lock->cll_users += delta; + nesting = cl_lock_nesting(lock); + if (nesting == CNL_TOP) { + counters = &cl_env_info(env)->clt_counters[CNL_TOP]; + counters->ctc_nr_used += delta; + LASSERT(counters->ctc_nr_used >= 0); + } +} + +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock); + lu_ref_del(&lock->cll_holders, scope, source); + cl_lock_hold_mod(env, lock, -1); + if (lock->cll_holds == 0) { + CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock); + if (lock->cll_descr.cld_mode == CLM_PHANTOM || + lock->cll_descr.cld_mode == CLM_GROUP || + lock->cll_state != CLS_CACHED) + /* + * If lock is still phantom or grouplock when user is + * done with it---destroy the lock. + */ + lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED; + if (lock->cll_flags & CLF_CANCELPEND) { + lock->cll_flags &= ~CLF_CANCELPEND; + cl_lock_cancel0(env, lock); + } + if (lock->cll_flags & CLF_DOOMED) { + /* no longer doomed: it's dead... Jim. */ + lock->cll_flags &= ~CLF_DOOMED; + cl_lock_delete0(env, lock); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_release); + +/** + * Waits until lock state is changed. + * + * This function is called with cl_lock mutex locked, atomically releases + * mutex and goes to sleep, waiting for a lock state change (signaled by + * cl_lock_signal()), and re-acquires the mutex before return. + * + * This function is used to wait until lock state machine makes some progress + * and to emulate synchronous operations on top of asynchronous lock + * interface. + * + * \retval -EINTR wait was interrupted + * + * \retval 0 wait wasn't interrupted + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_signal() + */ +int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock) +{ + wait_queue_t waiter; + sigset_t blocked; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_depth == 1); + LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */ + + cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock); + result = lock->cll_error; + if (result == 0) { + /* To avoid being interrupted by the 'non-fatal' signals + * (SIGCHLD, for instance), we'd block them temporarily. + * LU-305 */ + blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + init_waitqueue_entry_current(&waiter); + add_wait_queue(&lock->cll_wq, &waiter); + set_current_state(TASK_INTERRUPTIBLE); + cl_lock_mutex_put(env, lock); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + + /* Returning ERESTARTSYS instead of EINTR so syscalls + * can be restarted if signals are pending here */ + result = -ERESTARTSYS; + if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) { + waitq_wait(&waiter, TASK_INTERRUPTIBLE); + if (!cfs_signal_pending()) + result = 0; + } + + cl_lock_mutex_get(env, lock); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lock->cll_wq, &waiter); + + /* Restore old blocked signals */ + cfs_restore_sigs(blocked); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_state_wait); + +static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + const struct cl_lock_slice *slice; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) + if (slice->cls_ops->clo_state != NULL) + slice->cls_ops->clo_state(env, slice, state); + wake_up_all(&lock->cll_wq); + EXIT; +} + +/** + * Notifies waiters that lock state changed. + * + * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all + * layers about state change by calling cl_lock_operations::clo_state() + * top-to-bottom. + */ +void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock); + cl_lock_state_signal(env, lock, lock->cll_state); + EXIT; +} +EXPORT_SYMBOL(cl_lock_signal); + +/** + * Changes lock state. + * + * This function is invoked to notify layers that lock state changed, possible + * as a result of an asynchronous event such as call-back reception. + * + * \post lock->cll_state == state + * + * \see cl_lock_operations::clo_state() + */ +void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + ENTRY; + LASSERT(lock->cll_state <= state || + (lock->cll_state == CLS_CACHED && + (state == CLS_HELD || /* lock found in cache */ + state == CLS_NEW || /* sub-lock canceled */ + state == CLS_INTRANSIT)) || + /* lock is in transit state */ + lock->cll_state == CLS_INTRANSIT); + + if (lock->cll_state != state) { + CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state); + CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state); + + cl_lock_state_signal(env, lock, state); + lock->cll_state = state; + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_state_set); + +static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + + result = -ENOSYS; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_unuse != NULL) { + result = slice->cls_ops->clo_unuse(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + } while (result == CLO_REPEAT); + + return result; +} + +/** + * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling + * cl_lock_operations::clo_use() top-to-bottom to notify layers. + * @atomic = 1, it must unuse the lock to recovery the lock to keep the + * use process atomic + */ +int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic) +{ + const struct cl_lock_slice *slice; + int result; + enum cl_lock_state state; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "use lock", lock); + + LASSERT(lock->cll_state == CLS_CACHED); + if (lock->cll_error) + RETURN(lock->cll_error); + + result = -ENOSYS; + state = cl_lock_intransit(env, lock); + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_use != NULL) { + result = slice->cls_ops->clo_use(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + + LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n", + lock->cll_state); + + if (result == 0) { + state = CLS_HELD; + } else { + if (result == -ESTALE) { + /* + * ESTALE means sublock being cancelled + * at this time, and set lock state to + * be NEW here and ask the caller to repeat. + */ + state = CLS_NEW; + result = CLO_REPEAT; + } + + /* @atomic means back-off-on-failure. */ + if (atomic) { + int rc; + rc = cl_unuse_try_internal(env, lock); + /* Vet the results. */ + if (rc < 0 && result > 0) + result = rc; + } + + } + cl_lock_extransit(env, lock, state); + RETURN(result); +} +EXPORT_SYMBOL(cl_use_try); + +/** + * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers + * top-to-bottom. + */ +static int cl_enqueue_kick(const struct lu_env *env, + struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + const struct cl_lock_slice *slice; + + ENTRY; + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue != NULL) { + result = slice->cls_ops->clo_enqueue(env, + slice, io, flags); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + RETURN(result); +} + +/** + * Tries to enqueue a lock. + * + * This function is called repeatedly by cl_enqueue() until either lock is + * enqueued, or error occurs. This function does not block waiting for + * networking communication to complete. + * + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \see cl_enqueue() cl_lock_operations::clo_enqueue() + * \see cl_lock_state::CLS_ENQUEUED + */ +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + + result = lock->cll_error; + if (result != 0) + break; + + switch (lock->cll_state) { + case CLS_NEW: + cl_lock_state_set(env, lock, CLS_QUEUING); + /* fall-through */ + case CLS_QUEUING: + /* kick layers. */ + result = cl_enqueue_kick(env, lock, io, flags); + /* For AGL case, the cl_lock::cll_state may + * become CLS_HELD already. */ + if (result == 0 && lock->cll_state == CLS_QUEUING) + cl_lock_state_set(env, lock, CLS_ENQUEUED); + break; + case CLS_INTRANSIT: + LASSERT(cl_lock_is_intransit(lock)); + result = CLO_WAIT; + break; + case CLS_CACHED: + /* yank lock from the cache. */ + result = cl_use_try(env, lock, 0); + break; + case CLS_ENQUEUED: + case CLS_HELD: + result = 0; + break; + default: + case CLS_FREEING: + /* + * impossible, only held locks with increased + * ->cll_holds can be enqueued, and they cannot be + * freed. + */ + LBUG(); + } + } while (result == CLO_REPEAT); + RETURN(result); +} +EXPORT_SYMBOL(cl_enqueue_try); + +/** + * Cancel the conflicting lock found during previous enqueue. + * + * \retval 0 conflicting lock has been canceled. + * \retval -ve error code. + */ +int cl_lock_enqueue_wait(const struct lu_env *env, + struct cl_lock *lock, + int keep_mutex) +{ + struct cl_lock *conflict; + int rc = 0; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + LASSERT(lock->cll_conflict != NULL); + + conflict = lock->cll_conflict; + lock->cll_conflict = NULL; + + cl_lock_mutex_put(env, lock); + LASSERT(cl_lock_nr_mutexed(env) == 0); + + cl_lock_mutex_get(env, conflict); + cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict); + cl_lock_cancel(env, conflict); + cl_lock_delete(env, conflict); + + while (conflict->cll_state != CLS_FREEING) { + rc = cl_lock_state_wait(env, conflict); + if (rc != 0) + break; + } + cl_lock_mutex_put(env, conflict); + lu_ref_del(&conflict->cll_reference, "cancel-wait", lock); + cl_lock_put(env, conflict); + + if (keep_mutex) + cl_lock_mutex_get(env, lock); + + LASSERT(rc <= 0); + RETURN(rc); +} +EXPORT_SYMBOL(cl_lock_enqueue_wait); + +static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + ENTRY; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_user_add(env, lock); + do { + result = cl_enqueue_try(env, lock, io, enqflags); + if (result == CLO_WAIT) { + if (lock->cll_conflict != NULL) + result = cl_lock_enqueue_wait(env, lock, 1); + else + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result != 0) + cl_unuse_try(env, lock); + LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL), + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + RETURN(result); +} + +/** + * Enqueues a lock. + * + * \pre current thread or io owns a hold on lock. + * + * \post ergo(result == 0, lock->users increased) + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + */ +int cl_enqueue(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + ENTRY; + + cl_lock_lockdep_acquire(env, lock, enqflags); + cl_lock_mutex_get(env, lock); + result = cl_enqueue_locked(env, lock, io, enqflags); + cl_lock_mutex_put(env, lock); + if (result != 0) + cl_lock_lockdep_release(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_enqueue); + +/** + * Tries to unlock a lock. + * + * This function is called to release underlying resource: + * 1. for top lock, the resource is sublocks it held; + * 2. for sublock, the resource is the reference to dlmlock. + * + * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see cl_unuse() cl_lock_operations::clo_unuse() + * \see cl_lock_state::CLS_CACHED + */ +int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + enum cl_lock_state state = CLS_NEW; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock); + + if (lock->cll_users > 1) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold + * underlying resources. */ + if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* + * New lock users (->cll_users) are not protecting unlocking + * from proceeding. From this point, lock eventually reaches + * CLS_CACHED, is reinitialized to CLS_NEW or fails into + * CLS_FREEING. + */ + state = cl_lock_intransit(env, lock); + + result = cl_unuse_try_internal(env, lock); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(result != CLO_WAIT); + cl_lock_user_del(env, lock); + if (result == 0 || result == -ESTALE) { + /* + * Return lock back to the cache. This is the only + * place where lock is moved into CLS_CACHED state. + * + * If one of ->clo_unuse() methods returned -ESTALE, lock + * cannot be placed into cache and has to be + * re-initialized. This happens e.g., when a sub-lock was + * canceled while unlocking was in progress. + */ + if (state == CLS_HELD && result == 0) + state = CLS_CACHED; + else + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + + /* + * Hide -ESTALE error. + * If the lock is a glimpse lock, and it has multiple + * stripes. Assuming that one of its sublock returned -ENAVAIL, + * and other sublocks are matched write locks. In this case, + * we can't set this lock to error because otherwise some of + * its sublocks may not be canceled. This causes some dirty + * pages won't be written to OSTs. -jay + */ + result = 0; + } else { + CERROR("result = %d, this is unlikely!\n", result); + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + } + RETURN(result ?: lock->cll_error); +} +EXPORT_SYMBOL(cl_unuse_try); + +static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + ENTRY; + + result = cl_unuse_try(env, lock); + if (result) + CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result); + + EXIT; +} + +/** + * Unlocks a lock. + */ +void cl_unuse(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_mutex_get(env, lock); + cl_unuse_locked(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_release(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_unuse); + +/** + * Tries to wait for a lock. + * + * This function is called repeatedly by cl_wait() until either lock is + * granted, or error occurs. This function does not block waiting for network + * communication to complete. + * + * \see cl_wait() cl_lock_operations::clo_wait() + * \see cl_lock_state::CLS_HELD + */ +int cl_wait_try(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_QUEUING || + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD || + lock->cll_state == CLS_INTRANSIT, + "lock state: %d\n", lock->cll_state); + LASSERT(lock->cll_users > 0); + LASSERT(lock->cll_holds > 0); + + result = lock->cll_error; + if (result != 0) + break; + + if (cl_lock_is_intransit(lock)) { + result = CLO_WAIT; + break; + } + + if (lock->cll_state == CLS_HELD) + /* nothing to do */ + break; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_wait != NULL) { + result = slice->cls_ops->clo_wait(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_HELD); + } + } while (result == CLO_REPEAT); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait_try); + +/** + * Waits until enqueued lock is granted. + * + * \pre current thread or io owns a hold on the lock + * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \post ergo(result == 0, lock->cll_state == CLS_HELD) + */ +int cl_wait(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + ENTRY; + cl_lock_mutex_get(env, lock); + + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD, + "Wrong state %d \n", lock->cll_state); + LASSERT(lock->cll_holds > 0); + + do { + result = cl_wait_try(env, lock); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result < 0) { + cl_unuse_try(env, lock); + cl_lock_lockdep_release(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, "wait lock", lock); + cl_lock_mutex_put(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait); + +/** + * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock + * value. + */ +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + unsigned long pound; + unsigned long ounce; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + pound = 0; + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_weigh != NULL) { + ounce = slice->cls_ops->clo_weigh(env, slice); + pound += ounce; + if (pound < ounce) /* over-weight^Wflow */ + pound = ~0UL; + } + } + RETURN(pound); +} +EXPORT_SYMBOL(cl_lock_weigh); + +/** + * Notifies layers that lock description changed. + * + * The server can grant client a lock different from one that was requested + * (e.g., larger in extent). This method is called when actually granted lock + * description becomes known to let layers to accommodate for changed lock + * description. + * + * \see cl_lock_operations::clo_modify() + */ +int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc) +{ + const struct cl_lock_slice *slice; + struct cl_object *obj = lock->cll_descr.cld_obj; + struct cl_object_header *hdr = cl_object_header(obj); + int result; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "modify lock", lock); + /* don't allow object to change */ + LASSERT(obj == desc->cld_obj); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_modify != NULL) { + result = slice->cls_ops->clo_modify(env, slice, desc); + if (result != 0) + RETURN(result); + } + } + CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n", + PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu))); + /* + * Just replace description in place. Nothing more is needed for + * now. If locks were indexed according to their extent and/or mode, + * that index would have to be updated here. + */ + spin_lock(&hdr->coh_lock_guard); + lock->cll_descr = *desc; + spin_unlock(&hdr->coh_lock_guard); + RETURN(0); +} +EXPORT_SYMBOL(cl_lock_modify); + +/** + * Initializes lock closure with a given origin. + * + * \see cl_lock_closure + */ +void cl_lock_closure_init(const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait) +{ + LINVRNT(cl_lock_is_mutexed(origin)); + LINVRNT(cl_lock_invariant(env, origin)); + + INIT_LIST_HEAD(&closure->clc_list); + closure->clc_origin = origin; + closure->clc_wait = wait; + closure->clc_nr = 0; +} +EXPORT_SYMBOL(cl_lock_closure_init); + +/** + * Builds a closure of \a lock. + * + * Building of a closure consists of adding initial lock (\a lock) into it, + * and calling cl_lock_operations::clo_closure() methods of \a lock. These + * methods might call cl_lock_closure_build() recursively again, adding more + * locks to the closure, etc. + * + * \see cl_lock_closure + */ +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(closure->clc_origin)); + LINVRNT(cl_lock_invariant(env, closure->clc_origin)); + + result = cl_lock_enclosure(env, lock, closure); + if (result == 0) { + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_closure != NULL) { + result = slice->cls_ops->clo_closure(env, slice, + closure); + if (result != 0) + break; + } + } + } + if (result != 0) + cl_lock_disclosure(env, closure); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_closure_build); + +/** + * Adds new lock to a closure. + * + * Try-locks \a lock and if succeeded, adds it to the closure (never more than + * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting + * until next try-lock is likely to succeed. + */ +int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + int result = 0; + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock); + if (!cl_lock_mutex_try(env, lock)) { + /* + * If lock->cll_inclosure is not empty, lock is already in + * this closure. + */ + if (list_empty(&lock->cll_inclosure)) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure", closure); + list_add(&lock->cll_inclosure, &closure->clc_list); + closure->clc_nr++; + } else + cl_lock_mutex_put(env, lock); + result = 0; + } else { + cl_lock_disclosure(env, closure); + if (closure->clc_wait) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure-w", closure); + cl_lock_mutex_put(env, closure->clc_origin); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + cl_lock_mutex_get(env, lock); + cl_lock_mutex_put(env, lock); + + cl_lock_mutex_get(env, closure->clc_origin); + lu_ref_del(&lock->cll_reference, "closure-w", closure); + cl_lock_put(env, lock); + } + result = CLO_REPEAT; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_enclosure); + +/** Releases mutices of enclosed locks. */ +void cl_lock_disclosure(const struct lu_env *env, + struct cl_lock_closure *closure) +{ + struct cl_lock *scan; + struct cl_lock *temp; + + cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin); + list_for_each_entry_safe(scan, temp, &closure->clc_list, + cll_inclosure){ + list_del_init(&scan->cll_inclosure); + cl_lock_mutex_put(env, scan); + lu_ref_del(&scan->cll_reference, "closure", closure); + cl_lock_put(env, scan); + closure->clc_nr--; + } + LASSERT(closure->clc_nr == 0); +} +EXPORT_SYMBOL(cl_lock_disclosure); + +/** Finalizes a closure. */ +void cl_lock_closure_fini(struct cl_lock_closure *closure) +{ + LASSERT(closure->clc_nr == 0); + LASSERT(list_empty(&closure->clc_list)); +} +EXPORT_SYMBOL(cl_lock_closure_fini); + +/** + * Destroys this lock. Notifies layers (bottom-to-top) that lock is being + * destroyed, then destroy the lock. If there are holds on the lock, postpone + * destruction until all holds are released. This is called when a decision is + * made to destroy the lock in the future. E.g., when a blocking AST is + * received on it, or fatal communication error happens. + * + * Caller must have a reference on this lock to prevent a situation, when + * deleted lock lingers in memory for indefinite time, because nobody calls + * cl_lock_put() to finish it. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * \pre ergo(cl_lock_nesting(lock) == CNL_TOP, + * cl_lock_nr_mutexed(env) == 1) + * [i.e., if a top-lock is deleted, mutices of no other locks can be + * held, as deletion of sub-locks might require releasing a top-lock + * mutex] + * + * \see cl_lock_operations::clo_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP, + cl_lock_nr_mutexed(env) == 1)); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "delete lock", lock); + if (lock->cll_holds == 0) + cl_lock_delete0(env, lock); + else + lock->cll_flags |= CLF_DOOMED; + EXIT; +} +EXPORT_SYMBOL(cl_lock_delete); + +/** + * Mark lock as irrecoverably failed, and mark it for destruction. This + * happens when, e.g., server fails to grant a lock to us, or networking + * time-out happens. + * + * \pre atomic_read(&lock->cll_ref) > 0 + * + * \see clo_lock_delete() + * \see cl_lock::cll_holds + */ +void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + if (lock->cll_error == 0 && error != 0) { + cl_lock_trace(D_DLMTRACE, env, "set lock error", lock); + lock->cll_error = error; + cl_lock_signal(env, lock); + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_error); + +/** + * Cancels this lock. Notifies layers + * (bottom-to-top) that lock is being cancelled, then destroy the lock. If + * there are holds on the lock, postpone cancellation until + * all holds are released. + * + * Cancellation notification is delivered to layers at most once. + * + * \see cl_lock_operations::clo_cancel() + * \see cl_lock::cll_holds + */ +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + if (lock->cll_holds == 0) + cl_lock_cancel0(env, lock); + else + lock->cll_flags |= CLF_CANCELPEND; + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. + */ +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, + int pending, int canceld) +{ + struct cl_object_header *head; + struct cl_lock *scan; + struct cl_lock *lock; + struct cl_lock_descr *need; + + ENTRY; + + head = cl_object_header(obj); + need = &cl_env_info(env)->clt_descr; + lock = NULL; + + need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but + * not PHANTOM */ + need->cld_start = need->cld_end = index; + need->cld_enq_flags = 0; + + spin_lock(&head->coh_lock_guard); + /* It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too */ + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != except && + (scan->cll_descr.cld_mode == CLM_GROUP || + cl_lock_ext_match(&scan->cll_descr, need)) && + scan->cll_state >= CLS_HELD && + scan->cll_state < CLS_FREEING && + /* + * This check is racy as the lock can be canceled right + * after it is done, but this is fine, because page exists + * already. + */ + (canceld || !(scan->cll_flags & CLF_CANCELLED)) && + (pending || !(scan->cll_flags & CLF_CANCELPEND))) { + /* Don't increase cs_hit here since this + * is just a helper function. */ + cl_lock_get_trust(scan); + lock = scan; + break; + } + } + spin_unlock(&head->coh_lock_guard); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_at_pgoff); + +/** + * Calculate the page offset at the layer of @lock. + * At the time of this writing, @page is top page and @lock is sub lock. + */ +static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock) +{ + struct lu_device_type *dtype; + const struct cl_page_slice *slice; + + dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type; + slice = cl_page_at(page, dtype); + LASSERT(slice != NULL); + return slice->cpl_page->cp_index; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + pgoff_t index = pgoff_at_lock(page, lock); + + if (index >= info->clt_fn_index) { + struct cl_lock *tmp; + + /* refresh non-overlapped index */ + tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index, + lock, 1, 0); + if (tmp != NULL) { + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, clt_fn_index). This + * is safe because if tmp lock is canceled, it will + * discard these pages. */ + info->clt_fn_index = tmp->cll_descr.cld_end + 1; + if (tmp->cll_descr.cld_end == CL_PAGE_EOF) + info->clt_fn_index = CL_PAGE_EOF; + cl_lock_put(env, tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->clt_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, page)))); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(env, page)))); + + info->clt_next_index = pgoff_at_lock(page, lock) + 1; + if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). + */ +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_lock_descr *descr = &lock->cll_descr; + cl_page_gang_cb_t cb; + int res; + int result; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + + io->ci_obj = cl_object_top(descr->cld_obj); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->clt_fn_index = info->clt_next_index = descr->cld_start; + do { + res = cl_page_gang_lookup(env, descr->cld_obj, io, + info->clt_next_index, descr->cld_end, + cb, (void *)lock); + if (info->clt_next_index > descr->cld_end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_discard_pages); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + ENTRY; + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). + */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", current); + +again: + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_users <= 1); + if (unlikely(lock->cll_users == 1)) { + struct l_wait_info lwi = { 0 }; + + cl_lock_mutex_put(env, lock); + l_wait_event(lock->cll_wq, + lock->cll_users == 0, + &lwi); + goto again; + } + + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", current); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); + EXIT; +} +EXPORT_SYMBOL(cl_locks_prune); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED)) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + RETURN(lock); +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. + */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + int rc; + __u32 enqflags = need->cld_enq_flags; + + ENTRY; + do { + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (IS_ERR(lock)) + break; + + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + if (!(enqflags & CEF_AGL)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, + enqflags); + break; + } + rc = 1; + } + cl_unuse_locked(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, + rc <= 0 ? "enqueue failed" : "agl succeed", lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + if (rc > 0) { + LASSERT(enqflags & CEF_AGL); + lock = NULL; + } else if (rc != 0) { + lock = ERR_PTR(rc); + } + } while (rc == 0); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. + */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + ENTRY; + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. + */ +void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_hold_release(env, lock, scope, source); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_unhold); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_mutex_get(env, lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_used_mod(env, lock, +1); + EXIT; +} +EXPORT_SYMBOL(cl_lock_user_add); + +void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_users > 0); + + ENTRY; + cl_lock_used_mod(env, lock, -1); + if (lock->cll_users == 0) + wake_up_all(&lock->cll_wq); + EXIT; +} +EXPORT_SYMBOL(cl_lock_user_del); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char *names[] = { + [CLM_PHANTOM] = "P", + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + if (0 <= mode && mode < ARRAY_SIZE(names)) + return names[mode]; + else + return "U"; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ", + lock, atomic_read(&lock->cll_ref), + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); + +int cl_lock_init(void) +{ + return lu_kmem_init(cl_lock_caches); +} + +void cl_lock_fini(void) +{ + lu_kmem_fini(cl_lock_caches); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c new file mode 100644 index 000000000000..faa9ef63a3f5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -0,0 +1,1155 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov + */ + +/* + * Locking. + * + * i_mutex + * PG_locked + * ->coh_page_guard + * ->coh_lock_guard + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +/* class_put_type() */ +#include +#include +#include +#include +#include /* for cfs_hash stuff */ +#include +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_page_guard */ +static struct lock_class_key cl_page_guard_class; +/** Lock class of cl_object_header::coh_lock_guard */ +static struct lock_class_key cl_lock_guard_class; +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_page_guard); + spin_lock_init(&h->coh_lock_guard); + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class); + lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_pages = 0; + /* XXX hard coded GFP_* mask. */ + INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&h->coh_locks); + h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + LASSERT(list_empty(&h->coh_locks)); + lu_object_header_fini(&h->coh_lu); +} +EXPORT_SYMBOL(cl_object_header_fini); + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. + * + * \see cl_page_top(), cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). + */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_set(). + */ +void cl_object_attr_lock(struct cl_object *o) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom + * to top. + */ +int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_set != NULL) { + result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_set); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. + * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: "LPU64" mtime: "LPU64" atime: "LPU64" " + "ctime: "LPU64" blocks: "LPU64"\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. + */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); + /* + * Destroy all locks. Object destruction (including cl_inode_fini()) + * cannot cancel the locks, because in the case of a local client, + * where client and server share the same thread running + * prune_icache(), this can dead-lock with ldlm_cancel_handler() + * waiting on __wait_on_freeing_inode(). + */ + cl_locks_prune(env, obj, 0); +} +EXPORT_SYMBOL(cl_object_kill); + +/** + * Prunes caches of pages and locks for this object. + */ +void cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + ENTRY; + cl_pages_prune(env, obj); + cl_locks_prune(env, obj, 1); + EXIT; +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Check if the object has locks. + */ +int cl_object_has_locks(struct cl_object *obj) +{ + struct cl_object_header *head = cl_object_header(obj); + int has; + + spin_lock(&head->coh_lock_guard); + has = list_empty(&head->coh_locks); + spin_unlock(&head->coh_lock_guard); + + return (has == 0); +} +EXPORT_SYMBOL(cl_object_has_locks); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +int cache_stats_print(const struct cache_stats *cs, + char *page, int count, int h) +{ + int nob = 0; + int i; + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + nob += snprintf(page + nob, count - nob, "%6s", " "); + for (i = 0; i < CS_NR; i++) + nob += snprintf(page + nob, count - nob, + "%8s", names[i]); + nob += snprintf(page + nob, count - nob, "\n"); + } + + nob += snprintf(page + nob, count - nob, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + nob += snprintf(page + nob, count - nob, "%8u", + atomic_read(&cs->cs_stats[i])); + return nob; +} + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + int i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + cache_stats_init(&s->cs_locks, "locks"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i) + atomic_set(&s->cs_locks_state[i], 0); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, char *page, int count) +{ + int nob; + int i; + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + static const char *lstate[] = { + [CLS_NEW] = "n", + [CLS_QUEUING] = "q", + [CLS_ENQUEUED] = "e", + [CLS_HELD] = "h", + [CLS_INTRANSIT] = "t", + [CLS_CACHED] = "c", + [CLS_FREEING] = "f" + }; +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + nob = lu_site_stats_print(&site->cs_lu, page, count); + nob += cache_stats_print(&site->cs_pages, page + nob, count - nob, 1); + nob += snprintf(page + nob, count - nob, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + nob += snprintf(page + nob, count - nob, "%s: %u ", + pstate[i], + atomic_read(&site->cs_pages_state[i])); + nob += snprintf(page + nob, count - nob, "]\n"); + nob += cache_stats_print(&site->cs_locks, page + nob, count - nob, 0); + nob += snprintf(page + nob, count - nob, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + nob += snprintf(page + nob, count - nob, "%s: %u ", + lstate[i], + atomic_read(&site->cs_locks_state[i])); + nob += snprintf(page + nob, count - nob, "]\n"); + nob += cache_stats_print(&cl_env_stats, page + nob, count - nob, 0); + nob += snprintf(page + nob, count - nob, "\n"); + return nob; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: ony on RedHat kernel. + * - ... + * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /** + * This allows cl_env to be entered into cl_env_hash which implements + * the current thread -> client environment lookup. + */ + struct hlist_node ce_node; + /** + * Owner for the current cl_env. + * + * If LL_TASK_CL_ENV is defined, this point to the owning current, + * only for debugging purpose ; + * Otherwise hash is used, and this is the key for cfs_hash. + * Now current thread pid is stored. Note using thread pointer would + * lead to unbalanced hash because of its specific allocation locality + * and could be varied for different platforms and OSes, even different + * OS versions. + */ + void *ce_owner; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. + */ + void *ce_debug; +}; + +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + + +/* + * The implementation of using hash table to connect cl_env and thread + */ + +static cfs_hash_t *cl_env_hash; + +static unsigned cl_env_hops_hash(cfs_hash_t *lh, + const void *key, unsigned mask) +{ +#if BITS_PER_LONG == 64 + return cfs_hash_u64_hash((__u64)key, mask); +#else + return cfs_hash_u32_hash((__u32)key, mask); +#endif +} + +static void *cl_env_hops_obj(struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); + return (void *)cle; +} + +static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn) +{ + struct cl_env *cle = cl_env_hops_obj(hn); + + LASSERT(cle->ce_owner != NULL); + return (key == cle->ce_owner); +} + +static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); +} + +static cfs_hash_ops_t cl_env_hops = { + .hs_hash = cl_env_hops_hash, + .hs_key = cl_env_hops_obj, + .hs_keycmp = cl_env_hops_keycmp, + .hs_object = cl_env_hops_obj, + .hs_get = cl_env_hops_noop, + .hs_put_locked = cl_env_hops_noop, +}; + +static inline struct cl_env *cl_env_fetch(void) +{ + struct cl_env *cle; + + cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid); + LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0)); + return cle; +} + +static inline void cl_env_attach(struct cl_env *cle) +{ + if (cle) { + int rc; + + LASSERT(cle->ce_owner == NULL); + cle->ce_owner = (void *) (long) current->pid; + rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(rc == 0); + } +} + +static inline void cl_env_do_detach(struct cl_env *cle) +{ + void *cookie; + + LASSERT(cle->ce_owner == (void *) (long) current->pid); + cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(cookie == cle); + cle->ce_owner = NULL; +} + +static int cl_env_store_init(void) { + cl_env_hash = cfs_hash_create("cl_env", + HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, + HASH_CL_ENV_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &cl_env_hops, + CFS_HASH_RW_BKTLOCK); + return cl_env_hash != NULL ? 0 :-ENOMEM; +} + +static void cl_env_store_fini(void) { + cfs_hash_putref(cl_env_hash); +} + + +static inline struct cl_env *cl_env_detach(struct cl_env *cle) +{ + if (cle == NULL) + cle = cl_env_fetch(); + + if (cle && cle->ce_owner) + cl_env_do_detach(cle); + + return cle; +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(lookup); + + /* check that we don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = cl_env_fetch(); + if (cle != NULL) { + CL_ENV_INC(hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(int *refcheck) +{ + struct lu_env *env; + + env = cl_env_peek(refcheck); + if (env == NULL) { + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, + __builtin_return_address(0)); + + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + cl_env_attach(cle); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(int *refcheck, __u32 tags) +{ + struct lu_env *env; + + LASSERT(cl_env_peek(refcheck) == NULL); + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + LASSERT(cle->ce_owner == NULL); + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Release an environment. + * + * Decrement \a env reference counter. When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + CL_ENV_DEC(busy); + cl_env_detach(cle); + cle->ce_debug = NULL; + cl_env_exit(cle); + cl_env_fini(cle); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Declares a point of re-entrancy. + * + * \see cl_env_reexit() + */ +void *cl_env_reenter(void) +{ + return cl_env_detach(NULL); +} +EXPORT_SYMBOL(cl_env_reenter); + +/** + * Exits re-entrancy. + */ +void cl_env_reexit(void *cookie) +{ + cl_env_detach(NULL); + cl_env_attach(cookie); +} +EXPORT_SYMBOL(cl_env_reexit); + +/** + * Setup user-supplied \a env as a current environment. This is to be used to + * guaranteed that environment exists even when cl_env_get() fails. It is up + * to user to ensure proper concurrency control. + * + * \see cl_env_unplant() + */ +void cl_env_implant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + + cl_env_attach(cle); + cl_env_get(refcheck); + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); +} +EXPORT_SYMBOL(cl_env_implant); + +/** + * Detach environment installed earlier by cl_env_implant(). + */ +void cl_env_unplant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 1); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + + cl_env_detach(cle); + cl_env_put(env, refcheck); +} +EXPORT_SYMBOL(cl_env_unplant); + +struct lu_env *cl_env_nested_get(struct cl_env_nest *nest) +{ + struct lu_env *env; + + nest->cen_cookie = NULL; + env = cl_env_peek(&nest->cen_refcheck); + if (env != NULL) { + if (!cl_io_is_going(env)) + return env; + else { + cl_env_put(env, &nest->cen_refcheck); + nest->cen_cookie = cl_env_reenter(); + } + } + env = cl_env_get(&nest->cen_refcheck); + if (IS_ERR(env)) { + cl_env_reexit(nest->cen_cookie); + return env; + } + + LASSERT(!cl_io_is_going(env)); + return env; +} +EXPORT_SYMBOL(cl_env_nested_get); + +void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env) +{ + cl_env_put(env, &nest->cen_refcheck); + cl_env_reexit(nest->cen_cookie); +} +EXPORT_SYMBOL(cl_env_nested_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + ENTRY; + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_attr2lvb); + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + ENTRY; + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_lvb2attr); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. + * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) { + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + int i; + + info = data; + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { + LASSERT(info->clt_counters[i].ctc_nr_held == 0); + LASSERT(info->clt_counters[i].ctc_nr_used == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = cl_env_store_init(); + if (result) + return result; + + result = lu_kmem_init(cl_object_caches); + if (result) + goto out_store; + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + goto out_kmem; + + result = cl_lock_init(); + if (result) + goto out_context; + + result = cl_page_init(); + if (result) + goto out_lock; + + return 0; +out_lock: + cl_lock_fini(); +out_context: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_store: + cl_env_store_fini(); + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). + */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + cl_env_store_fini(); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c new file mode 100644 index 000000000000..bb9335911c34 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -0,0 +1,1605 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +#include +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) + +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) + +/* Disable page statistic by default due to huge performance penalty. */ +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) + +/** + * Internal version of cl_page_top, it should be called if the page is + * known to be not freed, says with page referenced, or radix tree lock held, + * or page owned. + */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by locking page radix-tree + * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. + * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + ENTRY; + + page = cl_page_top_trusted((struct cl_page *)page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + page = page->cp_child; + } while (page != NULL); + RETURN(NULL); +} + +/** + * Returns a page with given index in the given object, or NULL if no page is + * found. Acquires a reference on \a page. + * + * Locking: called under cl_object_header::coh_page_guard spin-lock. + */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index) +{ + struct cl_page *page; + + LASSERT(spin_is_locked(&hdr->coh_page_guard)); + + page = radix_tree_lookup(&hdr->coh_tree, index); + if (page != NULL) + cl_page_get_trust(page); + return page; +} +EXPORT_SYMBOL(cl_page_lookup); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. + */ +int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata) +{ + struct cl_object_header *hdr; + struct cl_page *page; + struct cl_page **pvec; + const struct cl_page_slice *slice; + const struct lu_device_type *dtype; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + int tree_lock = 1; + ENTRY; + + idx = start; + hdr = cl_object_header(obj); + pvec = cl_env_info(env)->clt_pvec; + dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type; + spin_lock(&hdr->coh_page_guard); + while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec, + idx, CLT_PVEC_SIZE)) > 0) { + int end_of_region = 0; + idx = pvec[nr - 1]->cp_index + 1; + for (i = 0, j = 0; i < nr; ++i) { + page = pvec[i]; + pvec[i] = NULL; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_index > end) { + end_of_region = 1; + break; + } + if (page->cp_state == CPS_FREEING) + continue; + + slice = cl_page_at_trusted(page, dtype); + /* + * Pages for lsm-less file has no underneath sub-page + * for osc, in case of ... + */ + PASSERT(env, page, slice != NULL); + + page = slice->cpl_page; + /* + * Can safely call cl_page_get_trust() under + * radix-tree spin-lock. + * + * XXX not true, because @page is from object another + * than @hdr and protected by different tree lock. + */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&hdr->coh_page_guard); + tree_lock = 0; + + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, page, cbdata); + lu_ref_del(&page->cp_reference, + "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < CLT_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&hdr->coh_page_guard); + tree_lock = 1; + } + if (tree_lock) + spin_unlock(&hdr->coh_page_guard); + RETURN(res); +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); + EXIT; +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. */ + *(enum cl_page_state *)&page->cp_state = state; +} + +static struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + ENTRY; + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + __GFP_IO); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + if (type == CPT_CACHEABLE) /* for radix tree */ + atomic_inc(&page->cp_ref); + page->cp_obj = o; + cl_object_get(o); + page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (result != 0) { + cl_page_delete0(env, page, 0); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + RETURN(page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +static struct cl_page *cl_page_find0(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type, + struct cl_page *parent) +{ + struct cl_page *page = NULL; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + int err; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } + + if (page != NULL) { + CS_PAGE_INC(o, hit); + RETURN(page); + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + if (IS_ERR(page)) + RETURN(page); + + if (type == CPT_TRANSIENT) { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + RETURN(page); + } + + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). + */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + page = ERR_PTR(err); + CL_PAGE_DEBUG(D_ERROR, env, ghost, + "fail to insert into radix tree: %d\n", err); + } else { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + hdr->coh_pages++; + } + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + RETURN(page); +} + +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + return cl_page_find0(env, o, idx, vmpage, type, NULL); +} +EXPORT_SYMBOL(cl_page_find); + + +struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent) +{ + return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent); +} +EXPORT_SYMBOL(cl_page_find_sub); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + /* + * Page invariant is protected by a VM lock. + */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return cl_page_in_use(pg) && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. + */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + } + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. + */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + RETURN(slice->cpl_ops->cpo_vmpage(env, slice)); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *top; + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. + */ + top = (struct cl_page *)vmpage->private; + if (top == NULL) + RETURN(NULL); + + for (page = top; page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE)); + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. */ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + RETURN(CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io)); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); + EXIT; +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + page->cp_task = NULL; + } + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. + * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + } else { + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), + (const struct lu_env *, + const struct cl_page_slice *, + struct cl_io *, int), + io, nonblock); + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + PASSERT(env, pg, pg->cp_req == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + } + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. + * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. + * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard)); +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * pages, e.g,. in a error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check page invariant. + */ +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix) +{ + struct cl_page *tmp = pg; + ENTRY; + + PASSERT(env, pg, pg == cl_page_top(pg)); + PASSERT(env, pg, pg->cp_state != CPS_FREEING); + + /* + * Severe all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(pg); + + /* + * unexport the page firstly before freeing it so that + * the page content is considered to be invalid. + * We have to do this because a CPS_FREEING cl_page may + * be NOT under the protection of a cl_lock. + * Afterwards, if this page is found by other threads, then this + * page will be forced to reread. + */ + cl_page_export(env, pg, 0); + cl_page_state_set0(env, pg, CPS_FREEING); + + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete), + (const struct lu_env *, const struct cl_page_slice *)); + + if (tmp->cp_type == CPT_CACHEABLE) { + if (!radix) + /* !radix means that @pg is not yet in the radix tree, + * skip removing it. + */ + tmp = pg->cp_child; + for (; tmp != NULL; tmp = tmp->cp_child) { + void *value; + struct cl_object_header *hdr; + + hdr = cl_object_header(tmp->cp_obj); + spin_lock(&hdr->coh_page_guard); + value = radix_tree_delete(&hdr->coh_tree, + tmp->cp_index); + PASSERT(env, tmp, value == tmp); + PASSERT(env, tmp, hdr->coh_pages > 0); + hdr->coh_pages--; + spin_unlock(&hdr->coh_page_guard); + cl_page_put(env, tmp); + } + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre pg == cl_page_top(pg) + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg, 1); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Unmaps page from user virtual memory. + * + * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The + * layer responsible for VM interaction has to unmap page from user space + * virtual memory. + * + * \see cl_page_operations::cpo_unmap() + */ +int cl_page_unmap(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap)); +} +EXPORT_SYMBOL(cl_page_unmap); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), + (const struct lu_env *, + const struct cl_page_slice *, int), uptodate); +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + int result; + const struct cl_page_slice *slice; + + ENTRY; + pg = cl_page_top_trusted((struct cl_page *)pg); + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. + */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); + if (result == 0) + cl_page_io_start(env, pg, crt); + + KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE, + equi(result == 0, + PageWriteback(cl_page_vmpage(env, pg))))); + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. + * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + /* cl_page::cp_req already cleared by the caller (osc_completion()) */ + PASSERT(env, pg, pg->cp_req == NULL); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + if (crt == CRT_READ && ioret == 0) { + PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); + pg->cp_flags |= CPF_READ_COMPLETED; + } + + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), + (const struct lu_env *, + const struct cl_page_slice *, int), ioret); + if (anchor) { + LASSERT(cl_page_is_vmlocked(env, pg)); + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + } + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, pg); + + if (anchor) + cl_sync_io_note(anchor, ioret); + + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. + * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, crt < CRT_NR); + + ENTRY; + if (crt >= CRT_NR) + RETURN(-EINVAL); + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), + (const struct lu_env *, + const struct cl_page_slice *)); + if (result == 0) { + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Notify layers that high level io decided to place this page into a cache + * for future transfer. + * + * The layer implementing transfer engine (osc) has to register this page in + * its queues. + * + * \pre cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_cache_add() + */ +int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *scan; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + if (crt >= CRT_NR) + RETURN(-EINVAL); + + list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) { + if (scan->cpl_ops->io[crt].cpo_cache_add == NULL) + continue; + + result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io); + if (result != 0) + break; + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_cache_add); + +/** + * Called if a pge is being written back by kernel's intention. + * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Checks whether page is protected by any extent lock is at least required + * mode. + * + * \return the same as in cl_page_operations::cpo_is_under_lock() method. + * \see cl_page_operations::cpo_is_under_lock() + */ +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + int rc; + + PINVRNT(env, page, cl_page_invariant(page)); + + ENTRY; + rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + PASSERT(env, page, rc != 0); + RETURN(rc); +} +EXPORT_SYMBOL(cl_page_is_under_lock); + +static int page_prune_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + cl_page_own(env, io, page); + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + return CLP_GANG_OKAY; +} + +/** + * Purges all cached pages belonging to the object \a obj. + */ +int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj) +{ + struct cl_thread_info *info; + struct cl_object *obj = cl_object_top(clobj); + struct cl_io *io; + int result; + + ENTRY; + info = cl_env_info(env); + io = &info->clt_io; + + /* + * initialize the io. This is ugly since we never do IO in this + * function, we just make cl_page_list functions happy. -jay + */ + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result != 0) { + cl_io_fini(env, io); + RETURN(io->ci_result); + } + + do { + result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, + page_prune_cb, NULL); + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_pages_prune); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), + (const struct lu_env *, + const struct cl_page_slice *,int, int), + from, to); +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_index, pg->cp_parent, pg->cp_child, + pg->cp_state, pg->cp_error, pg->cp_type, + pg->cp_owner, pg->cp_req, pg->cp_flags); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + struct cl_page *scan; + + for (scan = cl_page_top((struct cl_page *)pg); + scan != NULL; scan = scan->cp_child) + cl_page_header_print(env, cookie, printer, scan); + CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), + (const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p), cookie, printer); + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), + (const struct lu_env *, + const struct cl_page_slice *)); +} +EXPORT_SYMBOL(cl_page_cancel); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + /* + * XXX for now. + */ + return (loff_t)idx << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + /* + * XXX for now. + */ + return offset >> PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +int cl_page_size(const struct cl_object *obj) +{ + return 1 << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + ENTRY; + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = page; + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +int cl_page_init(void) +{ + return 0; +} + +void cl_page_fini(void) +{ +} diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c new file mode 100644 index 000000000000..20d9eaf46ae1 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -0,0 +1,691 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +# include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llog_internal.h" + + +struct obd_device *obd_devs[MAX_OBD_DEVICES]; +EXPORT_SYMBOL(obd_devs); +struct list_head obd_types; +DEFINE_RWLOCK(obd_dev_lock); + +__u64 obd_max_pages = 0; +__u64 obd_max_alloc = 0; +DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_alloc_fail_rate = 0; +EXPORT_SYMBOL(obd_alloc_fail_rate); +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_max_dirty_pages = 256; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_t obd_unstable_pages; +EXPORT_SYMBOL(obd_unstable_pages); +atomic_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; +EXPORT_SYMBOL(obd_jobid_var); + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * TODO: + * It's better to cache the jobid for later use if there is any + * efficient way, the cl_env code probably could be reused for this + * purpose. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/api. Then, the jobid must be cached. + */ +int lustre_get_jobid(char *jobid) +{ + int jobid_len = JOBSTATS_JOBID_SIZE; + int rc = 0; + ENTRY; + + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + RETURN(0); + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + current_comm(), current_fsuid()); + RETURN(0); + } + + rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len); + if (rc) { + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + } + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line) +{ + if (ptr == NULL || + (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) { + CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n", + ptr ? "force " :"", type, name, (__u64)size, file, + line); + CERROR(LPU64" total bytes and "LPU64" total pages " + "("LPU64" bytes) allocated by Lustre, " + "%d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << PAGE_CACHE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(obd_alloc_fail); + +static inline void obd_data2conn(struct lustre_handle *conn, + struct obd_ioctl_data *data) +{ + memset(conn, 0, sizeof *conn); + conn->cookie = data->ioc_cookie; +} + +static inline void obd_conn2data(struct obd_ioctl_data *data, + struct lustre_handle *conn) +{ + data->ioc_cookie = conn->cookie; +} + +int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + ENTRY; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data*)arg; + libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + + case OBD_GET_VERSION: + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + memcpy(data->ioc_bulk, BUILD_VERSION, + strlen(BUILD_VERSION) + 1); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. + */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_CLOSE_UUID: { + CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", + data->ioc_inlbuf1); + GOTO(out, err = 0); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + err = obd_ioctl_popdata((void *)arg, data, len); + + GOTO(out, err = 0); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + switch(cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + GOTO(out, err = 0); + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + } + } + + out: + if (buf) + obd_ioctl_freedata(buf, len); + RETURN(err); +} /* class_handle_ioctl */ + +extern psdev_t obd_psdev; + +#define OBD_INIT_CHECK +int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPU64, u64val); + if (len != 20) { + CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPD64, u64val); + if (len != 2) { + CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) { + CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val, + (__u64)PAGE_CACHE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +extern spinlock_t obd_types_lock; +extern int class_procfs_init(void); +extern int class_procfs_clean(void); + +static int __init init_obdclass(void) +{ + int i, err; + int lustre_register_fs(void); + + for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++) + INIT_LIST_HEAD(&capa_list[i]); + + LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n"); + + spin_lock_init(&obd_types_lock); + obd_zombie_impexp_init(); +#ifdef LPROCFS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + RETURN(-ENOMEM); + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); + lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT, + LPROCFS_CNTR_AVGMINMAX, + "pagesused", "pages"); +#endif + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + + class_init_uuidlist(); + err = class_handle_init(); + if (err) + return err; + + INIT_LIST_HEAD(&obd_types); + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err); + return err; + } + + /* This struct is already zeroed for us (static global) */ + for (i = 0; i < class_devno_max(); i++) + obd_devs[i] = NULL; + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). */ + if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT)) + obd_max_dirty_pages = num_physpages / 4; + else + obd_max_dirty_pages = num_physpages / 2; + + err = obd_init_caches(); + if (err) + return err; + err = class_procfs_init(); + if (err) + return err; + + err = lu_global_init(); + if (err) + return err; + + err = cl_global_init(); + if (err != 0) + return err; + + + err = llog_info_init(); + if (err) + return err; + + err = lustre_register_fs(); + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max1, max2; + + max1 = obd_pages_sum(); + max2 = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max1 > obd_max_pages) + obd_max_pages = max1; + if (max2 > obd_max_alloc) + obd_max_alloc = max2; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef LPROCFS +__u64 obd_memory_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_memory_max); + +__u64 obd_pages_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_pages; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_pages_max); +#endif + +/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this + * ifdef to the end of the file to cover module and versioning goo.*/ +static void cleanup_obdclass(void) +{ + int i; + int lustre_unregister_fs(void); + __u64 memory_leaked, pages_leaked; + __u64 memory_max, pages_max; + ENTRY; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + if (obd && obd->obd_set_up && + OBT(obd) && OBP(obd, detach)) { + /* XXX should this call generic detach otherwise? */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + OBP(obd, detach)(obd); + } + } + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_exit_uuidlist(); + obd_zombie_impexp_stop(); + + memory_leaked = obd_memory_sum(); + pages_leaked = obd_pages_sum(); + + memory_max = obd_memory_max(); + pages_max = obd_pages_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: "LPU64", leaked: "LPU64"\n", + memory_max, memory_leaked); + CDEBUG((pages_leaked) ? D_ERROR : D_INFO, + "obd_memory_pages max: "LPU64", leaked: "LPU64"\n", + pages_max, pages_leaked); + + EXIT; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION); +MODULE_LICENSE("GPL"); + +cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass); diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c new file mode 100644 index 000000000000..15f71bbb7276 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/debug.c @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include +#include +#include +#include + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->len, nb->page, nb->rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->page ? page_index(nb->page) : -1); +} +EXPORT_SYMBOL(dump_lniobuf); + +void dump_lsm(int level, struct lov_stripe_md *lsm) +{ + CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X," + " stripe_size %u, stripe_count %u, refc: %d," + " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm, + POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, + lsm->lsm_pool_name); +} +EXPORT_SYMBOL(dump_lsm); + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c new file mode 100644 index 000000000000..4303698ca643 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c @@ -0,0 +1,1055 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. + * Generic functions from dt_object.h + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +/* fid_be_to_cpu() */ +#include + +#include + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; +EXPORT_SYMBOL(dt_key); + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_start(env, th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn) +{ + struct dt_device *dev = txn->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (txn->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *txn) +{ + struct dt_txn_callback *cb; + + if (txn->th_local) + return; + + list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks, + dtc_linkage) { + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(txn, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. + */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name, BYPASS_CAPA); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, const struct lu_fid *fid, + struct lu_device *top_dev) +{ + struct lu_object *lo, *n; + ENTRY; + + lo = lu_object_find_at(env, top_dev, fid, NULL); + if (IS_ERR(lo)) + return (void *)lo; + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find a object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int result; + + result = dt_lookup_dir(env, obj, entry, fid); + lu_object_put(env, &obj->do_lu); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + dfh->dfh_o = obj; + return result; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. + */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + char *local = info->dti_buf; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, local, dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(dt_store_resolve); + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. + * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, + filename, fid); + lu_object_put(env, &dir->do_lu); + } else { + file = dir; + } + return file; +} +EXPORT_SYMBOL(dt_store_open); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + lu_object_put(env, &dto->do_lu); + RETURN(ERR_PTR(rc)); + } + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); + + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck */ +const struct dt_index_features dt_lfsck_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + int nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + int rc, size; + ENTRY; + + /* no support for variable key & record size for now */ + LASSERT((ii->ii_flags & II_FL_VARKEY) == 0); + LASSERT((ii->ii_flags & II_FL_VARREC) == 0); + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize + ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) + GOTO(out, rc = 0); + } + + if (nob < size) { + if (lip->lip_nr == 0) + GOTO(out, rc = -EINVAL); + GOTO(out, rc = 0); + } + + if ((ii->ii_flags & II_FL_NOHASH) == 0) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + /* then the key value */ + LASSERT(iops->key_size(env, it) == ii->ii_keysize); + key = iops->key(env, it); + memcpy(tmp_entry, key, ii->ii_keysize); + tmp_entry += ii->ii_keysize; + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + GOTO(out, rc); + + /* hash/key/record successfully copied! */ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry = tmp_entry + ii->ii_recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + unsigned int pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + nob = rdpg->rp_count; + if (nob <= 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. + */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count <= 0 && (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL) + /* we don't support directory transfer via OBD_IDX_READ for the + * time being */ + RETURN(-EOPNOTSUPP); + + if (!fid_is_quota(&ii->ii_fid)) + /* block access to all local files except quota files */ + RETURN(-EPERM); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= II_FL_NOHASH; + + ii->ii_keysize = feat->dif_keysize_max; + if ((feat->dif_flags & DT_IND_VARKEY) != 0) { + /* key size is variable */ + ii->ii_flags |= II_FL_VARKEY; + /* we don't support variable key size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + ii->ii_recsize = feat->dif_recsize_max; + if ((feat->dif_flags & DT_IND_VARREC) != 0) { + /* record size is variable */ + ii->ii_flags |= II_FL_VARREC; + /* we don't support variable record size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + if ((feat->dif_flags & DT_IND_NONUNQ) != 0) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + dt_read_lock(env, obj, 0); + /* fetch object version before walking the index */ + ii->ii_version = dt_version_get(env, obj); + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii); + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + lu_object_put(env, &obj->do_lu); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef LPROCFS + +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + *eof = 1; + rc = snprintf(page, count, "%d\n", + (unsigned) osfs.os_bsize); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_blksize); + +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal); + +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree); + +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail); + +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filestotal); + +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc != 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_ffree); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filesfree); + +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c new file mode 100644 index 000000000000..1cc9b55890ca --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/genops.c @@ -0,0 +1,1855 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include +#include +#include + +extern struct list_head obd_types; +spinlock_t obd_types_lock; + +struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +struct kmem_cache *import_cachep; + +struct list_head obd_zombie_imports; +struct list_head obd_zombie_exports; +spinlock_t obd_zombie_impexp_lock; +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, __GFP_IO); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} +EXPORT_SYMBOL(class_get_type); + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} +EXPORT_SYMBOL(class_put_type); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, + struct lprocfs_vars *vars, const char *name, + struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + ENTRY; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(rc); + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + GOTO (failed, rc); + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + +#ifdef LPROCFS + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO (failed, rc); + } +#endif + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + GOTO (failed, rc); + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + RETURN (0); + + failed: + if (type->typ_name != NULL) + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + RETURN(-EBUSY); + } + + /* we do not use type->typ_procroot as for compatibility purposes + * other modules can share names (i.e. lod can use lov entry). so + * we can't reference pointer as it can get invalided when another + * module removes the entry */ + lprocfs_try_remove_proc_entry(type->typ_name, proc_lustre_root); + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(0); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Find an empty slot in ::obd_devs[], create a new obd device in it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. + * + * \retval NULL if create fails, otherwise return the obd device + * pointer created. + */ +struct obd_device *class_newdev(const char *type_name, const char *name) +{ + struct obd_device *result = NULL; + struct obd_device *newdev; + struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) + GOTO(out_type, result = ERR_PTR(-ENOMEM)); + + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && (strcmp(name, obd->obd_name) == 0)) { + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0]='\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); + + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + GOTO(out, result = ERR_PTR(-EOVERFLOW)); + } + + if (IS_ERR(result)) + GOTO(out, result); + + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + RETURN(result); +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; +} + +void class_release_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; + write_unlock(&obd_dev_lock); + obd_device_free(obd); + + class_put_type(obd_type); +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + read_unlock(&obd_dev_lock); + return i; + } + break; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + read_unlock(&obd_dev_lock); + return i; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} +EXPORT_SYMBOL(class_num2obd); + +/** + * Get obd devices count. Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); + return; +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + specified, then only the client with that uuid is returned, + otherwise any client connected to the tgt is returned. */ +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, typ_name, + strlen(typ_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + searching at *next, and if a device is found, the next index to look + at is saved in *next. If next is NULL, then the first matching device + will always be returned. */ +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, mdt, ost */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + if (capa_cachep) { + kmem_cache_destroy(capa_cachep); + capa_cachep = NULL; + } + EXIT; +} + +int obd_init_caches(void) +{ + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + GOTO(out, -ENOMEM); + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + GOTO(out, -ENOMEM); + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + GOTO(out, -ENOMEM); + + LASSERT(capa_cachep == NULL); + capa_cachep = kmem_cache_create("capa_cache", + sizeof(struct obd_capa), 0, 0, NULL); + if (!capa_cachep) + GOTO(out, -ENOMEM); + + RETURN(0); + out: + obd_cleanup_caches(); + RETURN(-ENOMEM); + +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie); + export = class_handle2object(conn->cookie); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) + return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} +EXPORT_SYMBOL(class_conn2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_conn2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_uncommitted_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_hp_rpcs)); + obd_destroy_export(exp); + class_decref(obd, "export", exp); + + OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); + EXIT; +} + +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +static struct portals_handle_ops export_handle_ops = { + .hop_addref = export_handle_addref, + .hop_free = NULL, +}; + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} +EXPORT_SYMBOL(class_export_get); + +void class_export_put(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + + if (atomic_dec_and_test(&exp->exp_refcount)) { + LASSERT(!list_empty(&exp->exp_obd_chain)); + CDEBUG(D_IOCTL, "final put %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + /* release nid stat refererence */ + lprocfs_exp_cleanup(exp); + + obd_zombie_export_add(exp); + } +} +EXPORT_SYMBOL(class_export_put); + +/* Creates a new export, adds it to the hash table, and returns a + * pointer to it. The refcount is 2: one for the hash reference, and + * one for the pointer returned by this function. */ +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + cfs_hash_t *hash = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = cfs_time_current_sec(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + obd->obd_name, cluuid->uuid, rc); + GOTO(exit_err, rc = -EALREADY); + } + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + cfs_hash_del(hash, cluuid, &export->exp_uuid_hash); + GOTO(exit_unlock, rc = -ENODEV); + } + + class_incref(obd, "export", export); + list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); + list_add_tail(&export->exp_obd_chain_timed, + &export->exp_obd->obd_exports_timed); + export->exp_obd->obd_num_exports++; + spin_unlock(&obd->obd_dev_lock); + cfs_hash_putref(hash); + RETURN(export); + +exit_unlock: + spin_unlock(&obd->obd_dev_lock); +exit_err: + if (hash) + cfs_hash_putref(hash); + class_handle_unhash(&export->exp_handle); + LASSERT(hlist_unhashed(&export->exp_uuid_hash)); + obd_destroy_export(export); + OBD_FREE_PTR(export); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(class_new_export); + +void class_unlink_export(struct obd_export *exp) +{ + class_handle_unhash(&exp->exp_handle); + + spin_lock(&exp->exp_obd->obd_dev_lock); + /* delete an uuid-export hashitem from hashtables */ + if (!hlist_unhashed(&exp->exp_uuid_hash)) + cfs_hash_del(exp->exp_obd->obd_uuid_hash, + &exp->exp_client_uuid, + &exp->exp_uuid_hash); + + list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); + list_del_init(&exp->exp_obd_chain_timed); + exp->exp_obd->obd_num_exports--; + spin_unlock(&exp->exp_obd->obd_dev_lock); + class_export_put(exp); +} +EXPORT_SYMBOL(class_unlink_export); + +/* Import management functions */ +void class_import_destroy(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, + imp->imp_obd->obd_name); + + LASSERT_ATOMIC_ZERO(&imp->imp_refcount); + + ptlrpc_put_connection_superhack(imp->imp_connection); + + while (!list_empty(&imp->imp_conn_list)) { + struct obd_import_conn *imp_conn; + + imp_conn = list_entry(imp->imp_conn_list.next, + struct obd_import_conn, oic_item); + list_del_init(&imp_conn->oic_item); + ptlrpc_put_connection_superhack(imp_conn->oic_conn); + OBD_FREE(imp_conn, sizeof(*imp_conn)); + } + + LASSERT(imp->imp_sec == NULL); + class_decref(imp->imp_obd, "import", imp); + OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); + EXIT; +} + +static void import_handle_addref(void *import) +{ + class_import_get(import); +} + +static struct portals_handle_ops import_handle_ops = { + .hop_addref = import_handle_addref, + .hop_free = NULL, +}; + +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); + return import; +} +EXPORT_SYMBOL(class_import_get); + +void class_import_put(struct obd_import *imp) +{ + ENTRY; + + LASSERT(list_empty(&imp->imp_zombie_chain)); + LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); + + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, + atomic_read(&imp->imp_refcount) - 1, + imp->imp_obd->obd_name); + + if (atomic_dec_and_test(&imp->imp_refcount)) { + CDEBUG(D_INFO, "final put import %p\n", imp); + obd_zombie_import_add(imp); + } + + /* catch possible import put race */ + LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); + EXIT; +} +EXPORT_SYMBOL(class_import_put); + +static void init_imp_at(struct imp_at *at) { + int i; + at_init(&at->iat_net_latency, 0, 0); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. (But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. */ + imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + + return imp; +} +EXPORT_SYMBOL(class_new_import); + +void class_destroy_import(struct obd_import *import) +{ + LASSERT(import != NULL); + LASSERT(import != LP_POISON); + + class_handle_unhash(&import->imp_handle); + + spin_lock(&import->imp_lock); + import->imp_generation++; + spin_unlock(&import->imp_lock); + class_import_put(import); +} +EXPORT_SYMBOL(class_destroy_import); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + + LASSERT(lock->l_exp_refs_nr >= 0); + + if (lock->l_exp_refs_target != NULL && + lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", + exp, lock, lock->l_exp_refs_target); + } + if ((lock->l_exp_refs_nr ++) == 0) { + list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); + lock->l_exp_refs_target = exp; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_add_lock_ref); + +void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) +{ + spin_lock(&exp->exp_locks_list_guard); + LASSERT(lock->l_exp_refs_nr > 0); + if (lock->l_exp_refs_target != exp) { + LCONSOLE_WARN("lock %p, " + "mismatching export pointers: %p, %p\n", + lock, lock->l_exp_refs_target, exp); + } + if (-- lock->l_exp_refs_nr == 0) { + list_del_init(&lock->l_exp_refs_link); + lock->l_exp_refs_target = NULL; + } + CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", + lock, exp, lock->l_exp_refs_nr); + spin_unlock(&exp->exp_locks_list_guard); +} +EXPORT_SYMBOL(__class_export_del_lock_ref); +#endif + +/* A connection defines an export context in which preallocation can + be managed. This releases the export pointer reference, and returns + the export handle, so the export refcount is 1 when this function + returns. */ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (exp->exp_delayed) + obd->obd_delayed_clients--; + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if (exp->exp_failed && + (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). */ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + GOTO(no_disconn, already_disconnected); + } + + CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n", + export->exp_handle.h_cookie); + + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + if (exp) { + int connected; + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0); + spin_unlock(&exp->exp_lock); + return connected; + } + return 0; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. */ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at "CFS_TIME_T"\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. + */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + cfs_hash_t *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative " + "request\n", obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + cfs_hash_t *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_uuid); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), + exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks); + spin_unlock(&obd_zombie_impexp_lock); +} +EXPORT_SYMBOL(dump_exports); + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, + cfs_time_seconds(waited)); + if (waited > 5 && IS_PO2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count = 0; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + ENTRY; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); + EXIT; +} + +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static wait_queue_head_t obd_zombie_waitq; +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + +/** + * Add export to the obd_zombe thread and notify it. + */ +static void obd_zombie_export_add(struct obd_export *exp) { + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + LASSERT(imp->imp_rq_pool == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zomebie_impexp_thread get this notification. + * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +/** + * destroy zombie export/import thread. + */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. + */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + task_t *task; + + INIT_LIST_HEAD(&obd_zombie_imports); + INIT_LIST_HEAD(&obd_zombie_exports); + spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + init_waitqueue_head(&obd_zombie_waitq); + obd_zombie_pid = 0; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + wait_for_completion(&obd_zombie_start); + RETURN(0); +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Test if payload is part of kuc message + * @param p Pointer to payload area + * @returns boolean + */ +int kuc_ispayload(void *p) +{ + struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1; + + if (kh->kuc_magic == KUC_MAGIC) + return 1; + else + return 0; +} +EXPORT_SYMBOL(kuc_ispayload); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +inline void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c new file mode 100644 index 000000000000..622f8d165275 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/idmap.c @@ -0,0 +1,474 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. + * + * Author: Fan Yong + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#define lustre_get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define lustre_put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(group_info_t *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - CFS_GROUP_AT(group_info, mid); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist) +{ + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! */ +/* a simple shell-metzner sort */ +void lustre_groups_sort(group_info_t *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = CFS_GROUP_AT(group_info, right); + + while (left >= 0 && + CFS_GROUP_AT(group_info, left) > tmp) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + group_info_t *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + lustre_get_group_info(group_info); + rc = lustre_groups_search(group_info, grp); + lustre_put_group_info(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); + +struct lustre_idmap_entry { + struct list_head lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */ + struct list_head lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */ + struct list_head lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */ + struct list_head lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */ + uid_t lie_rmt_uid; /* remote uid */ + uid_t lie_lcl_uid; /* local uid */ + gid_t lie_rmt_gid; /* remote gid */ + gid_t lie_lcl_gid; /* local gid */ +}; + +static inline __u32 lustre_idmap_hashfunc(__u32 id) +{ + return id & (CFS_IDMAP_HASHSIZE - 1); +} + +static +struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + struct lustre_idmap_entry *e; + + OBD_ALLOC_PTR(e); + if (e == NULL) + return NULL; + + INIT_LIST_HEAD(&e->lie_rmt_uid_hash); + INIT_LIST_HEAD(&e->lie_lcl_uid_hash); + INIT_LIST_HEAD(&e->lie_rmt_gid_hash); + INIT_LIST_HEAD(&e->lie_lcl_gid_hash); + e->lie_rmt_uid = rmt_uid; + e->lie_lcl_uid = lcl_uid; + e->lie_rmt_gid = rmt_gid; + e->lie_lcl_gid = lcl_gid; + + return e; +} + +static void idmap_entry_free(struct lustre_idmap_entry *e) +{ + if (!list_empty(&e->lie_rmt_uid_hash)) + list_del(&e->lie_rmt_uid_hash); + if (!list_empty(&e->lie_lcl_uid_hash)) + list_del(&e->lie_lcl_uid_hash); + if (!list_empty(&e->lie_rmt_gid_hash)) + list_del(&e->lie_rmt_gid_hash); + if (!list_empty(&e->lie_lcl_gid_hash)) + list_del(&e->lie_lcl_gid_hash); + OBD_FREE_PTR(e); +} + +/* + * return value + * NULL: not found entry + * ERR_PTR(-EACCES): found 1(remote):N(local) mapped entry + * others: found normal entry + */ +static +struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t, + uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + struct list_head *head; + struct lustre_idmap_entry *e; + + head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)]; + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == rmt_uid) { + if (e->lie_lcl_uid == lcl_uid) { + if (e->lie_rmt_gid == rmt_gid && + e->lie_lcl_gid == lcl_gid) + /* must be quaternion match */ + return e; + } else { + /* 1:N uid mapping */ + CERROR("rmt uid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_uid, + e->lie_lcl_uid, lcl_uid); + return ERR_PTR(-EACCES); + } + } + + head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)]; + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == rmt_gid) { + if (e->lie_lcl_gid == lcl_gid) { + if (unlikely(e->lie_rmt_uid == rmt_uid && + e->lie_lcl_uid == lcl_uid)) + /* after uid mapping search above, + * we should never come here */ + LBUG(); + } else { + /* 1:N gid mapping */ + CERROR("rmt gid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_gid, + e->lie_lcl_gid, lcl_gid); + return ERR_PTR(-EACCES); + } + } + + return NULL; +} + +static __u32 idmap_lookup_uid(struct list_head *hash, int reverse, + __u32 uid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(uid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == uid) + return e->lie_lcl_uid; + } else { + list_for_each_entry(e, head, lie_lcl_uid_hash) + if (e->lie_lcl_uid == uid) + return e->lie_rmt_uid; + } + + return CFS_IDMAP_NOTFOUND; +} + +static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(gid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == gid) + return e->lie_lcl_gid; + } else { + list_for_each_entry(e, head, lie_lcl_gid_hash) + if (e->lie_lcl_gid == gid) + return e->lie_rmt_gid; + } + + return CFS_IDMAP_NOTFOUND; +} + +int lustre_idmap_add(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e0, *e1; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e0 = idmap_search_entry(t, ruid, luid, rgid, lgid); + spin_unlock(&t->lit_lock); + if (!e0) { + e0 = idmap_entry_alloc(ruid, luid, rgid, lgid); + if (!e0) + return -ENOMEM; + + spin_lock(&t->lit_lock); + e1 = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (e1 == NULL) { + list_add_tail(&e0->lie_rmt_uid_hash, + &t->lit_idmaps[RMT_UIDMAP_IDX] + [lustre_idmap_hashfunc(ruid)]); + list_add_tail(&e0->lie_lcl_uid_hash, + &t->lit_idmaps[LCL_UIDMAP_IDX] + [lustre_idmap_hashfunc(luid)]); + list_add_tail(&e0->lie_rmt_gid_hash, + &t->lit_idmaps[RMT_GIDMAP_IDX] + [lustre_idmap_hashfunc(rgid)]); + list_add_tail(&e0->lie_lcl_gid_hash, + &t->lit_idmaps[LCL_GIDMAP_IDX] + [lustre_idmap_hashfunc(lgid)]); + } + spin_unlock(&t->lit_lock); + if (e1 != NULL) { + idmap_entry_free(e0); + if (IS_ERR(e1)) + return PTR_ERR(e1); + } + } else if (IS_ERR(e0)) { + return PTR_ERR(e0); + } + + return 0; +} +EXPORT_SYMBOL(lustre_idmap_add); + +int lustre_idmap_del(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e; + int rc = 0; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (IS_ERR(e)) + rc = PTR_ERR(e); + else if (e) + idmap_entry_free(e); + spin_unlock(&t->lit_lock); + + return rc; +} +EXPORT_SYMBOL(lustre_idmap_del); + +int lustre_idmap_lookup_uid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, uid_t uid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (uid == mu->uc_o_uid) + return mu->uc_uid; + else if (uid == mu->uc_o_fsuid) + return mu->uc_fsuid; + } else { + if (uid == mu->uc_uid) + return mu->uc_o_uid; + else if (uid == mu->uc_fsuid) + return mu->uc_o_fsuid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX]; + + spin_lock(&t->lit_lock); + uid = idmap_lookup_uid(hash, reverse, uid); + spin_unlock(&t->lit_lock); + + return uid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_uid); + +int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t, + int reverse, gid_t gid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (gid == mu->uc_o_gid) + return mu->uc_gid; + else if (gid == mu->uc_o_fsgid) + return mu->uc_fsgid; + } else { + if (gid == mu->uc_gid) + return mu->uc_o_gid; + else if (gid == mu->uc_fsgid) + return mu->uc_o_fsgid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? LCL_GIDMAP_IDX : RMT_GIDMAP_IDX]; + + spin_lock(&t->lit_lock); + gid = idmap_lookup_gid(hash, reverse, gid); + spin_unlock(&t->lit_lock); + + return gid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_gid); + +struct lustre_idmap_table *lustre_idmap_init(void) +{ + struct lustre_idmap_table *t; + int i, j; + + OBD_ALLOC_PTR(t); + if(unlikely(t == NULL)) + return (ERR_PTR(-ENOMEM)); + + spin_lock_init(&t->lit_lock); + for (i = 0; i < ARRAY_SIZE(t->lit_idmaps); i++) + for (j = 0; j < ARRAY_SIZE(t->lit_idmaps[i]); j++) + INIT_LIST_HEAD(&t->lit_idmaps[i][j]); + + return t; +} +EXPORT_SYMBOL(lustre_idmap_init); + +void lustre_idmap_fini(struct lustre_idmap_table *t) +{ + struct list_head *list; + struct lustre_idmap_entry *e; + int i; + LASSERT(t); + + list = t->lit_idmaps[RMT_UIDMAP_IDX]; + spin_lock(&t->lit_lock); + for (i = 0; i < CFS_IDMAP_HASHSIZE; i++) + while (!list_empty(&list[i])) { + e = list_entry(list[i].next, + struct lustre_idmap_entry, + lie_rmt_uid_hash); + idmap_entry_free(e); + } + spin_unlock(&t->lit_lock); + + OBD_FREE_PTR(t); +} +EXPORT_SYMBOL(lustre_idmap_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c new file mode 100644 index 000000000000..b5c19ac1470f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linkea.c @@ -0,0 +1,194 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + * Use is subject to license terms. + * + * Author: Di Wang + */ + +#include +#include +#include + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_reccount = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + /* entries are swabbed by linkea_entry_unpack */ + } + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + if (leh->leh_reccount == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. + * Numbers are always big-endian + * \retval record length + */ +static int linkea_entry_pack(struct link_ea_entry *lee, + const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + fid_cpu_to_be(&tmpfid, pfid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + LASSERT(ldata->ld_leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (ldata->ld_leh->leh_len + ldata->ld_reclen > + ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + ldata->ld_leh->leh_len + + ldata->ld_reclen) < 0) + return -ENOMEM; + } + + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + ldata->ld_leh->leh_len += ldata->ld_reclen; + ldata->ld_leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n", + lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); +} +EXPORT_SYMBOL(linkea_del_buf); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0 */ + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c new file mode 100644 index 000000000000..16208ba9d072 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,430 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int proc_version; + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int err; + int offset = 0; + ENTRY; + + err = copy_from_user(&hdr, (void *)arg, sizeof(hdr)); + if ( err ) + RETURN(err); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + err = copy_from_user(*buf, (void *)arg, hdr.ioc_len); + if ( err ) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(err); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) { + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + } + + EXIT; + return 0; +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int obd_ioctl_popdata(void *arg, void *data, int len) +{ + int err; + + err = copy_to_user(arg, data, len); + if (err) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(obd_ioctl_popdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + ENTRY; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +psdev_t obd_psdev = { + .minor = OBD_DEV_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + + +#ifdef LPROCFS +int obd_proc_read_version(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, "lustre: %s\nkernel: %s\nbuild: %s\n", + LUSTRE_VERSION_STRING, "patchless_client", + BUILD_VERSION); +} + +int obd_proc_read_pinger(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, "%s\n", + "on" + ); +} + +/** + * Check all obd devices health + * + * \param page + * \param start + * \param off + * \param count + * \param eof + * \param data + * proc read function parameters, please refer to kernel + * code fs/proc/generic.c proc_file_read() + * \param data [in] unused + * + * \retval number of characters printed + */ +static int obd_proc_read_health(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int rc = 0, i; + *eof = 1; + + if (libcfs_catastrophe) + rc += snprintf(page + rc, count - rc, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + rc += snprintf(page + rc, count - rc, + "device %s reported unhealthy\n", + obd->obd_name); + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (rc == 0) + return snprintf(page, count, "healthy\n"); + + rc += snprintf(page + rc, count - rc, "NOT HEALTHY\n"); + return rc; +} + +static int obd_proc_rd_jobid_var(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return snprintf(page, count, "%s\n", obd_jobid_var); +} + +static int obd_proc_wr_jobid_var(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + /* Trim the trailing '\n' if any */ + memcpy(obd_jobid_var, buffer, count - (buffer[count - 1] == '\n')); + return count; +} + +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); + +struct lprocfs_vars lprocfs_base[] = { + { "version", obd_proc_read_version, NULL, NULL }, + { "pinger", obd_proc_read_pinger, NULL, NULL }, + { "health_check", obd_proc_read_health, NULL, NULL }, + { "jobid_var", obd_proc_rd_jobid_var, + obd_proc_wr_jobid_var, NULL }, + { 0 } +}; +#else +#define lprocfs_base NULL +#endif /* LPROCFS */ + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + return seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); +} + +struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = dp->data; + + return 0; +} + +struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int class_procfs_init(void) +{ + int rc; + ENTRY; + + obd_sysctl_init(); + proc_lustre_root = lprocfs_register("fs/lustre", NULL, + lprocfs_base, NULL); + rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444, + &obd_device_list_fops, NULL); + if (rc) + CERROR("error adding /proc/fs/lustre/devices file\n"); + RETURN(0); +} + +int class_procfs_clean(void) +{ + ENTRY; + if (proc_lustre_root) { + lprocfs_remove(&proc_lustre_root); + } + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 000000000000..6ee347153a16 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + +#include +#include /* for PAGE_CACHE_SIZE */ + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid) +{ + obd_flag newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid) +{ + __u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); + +void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + /* optimum IO size */ + if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits)) + dst->i_blkbits = ffs(src->o_blksize) - 1; + + if (dst->i_blkbits < PAGE_CACHE_SHIFT) + dst->i_blkbits = PAGE_CACHE_SHIFT; + + /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks) + /* + * XXX shouldn't overflow be checked here like in + * obdo_to_inode(). + */ + dst->i_blocks = src->o_blocks; +} +EXPORT_SYMBOL(obdo_refresh_inode); + +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID | + OBD_MD_FLID | OBD_MD_FLGROUP)), + "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid); + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ + dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } + if (valid & OBD_MD_FLBLKSZ) + dst->i_blkbits = ffs(src->o_blksize)-1; + if (valid & OBD_MD_FLMODE) + dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->i_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->i_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->i_flags = src->o_flags; +} +EXPORT_SYMBOL(obdo_to_inode); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 000000000000..46aad6813cab --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,445 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#ifdef CONFIG_SYSCTL +ctl_table_header_t *obd_table_header = NULL; +#endif + + +#define OBD_SYSCTL 300 + +enum { + OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */ + OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ + OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ + OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */ + OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */ + OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */ + OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ + OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */ + OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ + OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ + OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */ + OBD_AT_MIN, /* Adaptive timeouts params */ + OBD_AT_MAX, + OBD_AT_EXTRA, + OBD_AT_EARLY_MARGIN, + OBD_AT_HISTORY, +}; + + +int LL_PROC_PROTO(proc_set_timeout) +{ + int rc; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + return rc; +} + +int LL_PROC_PROTO(proc_memory_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_mem_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_max_dirty_pages_in_mb) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + /* Don't allow them to let dirty pages exceed 90% of system + * memory and set a hard minimum of 4MB. */ + if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) { + CERROR("Refusing to set max dirty pages to %u, which " + "is more than 90%% of available RAM; setting " + "to %lu\n", obd_max_dirty_pages, + ((num_physpages / 10) * 9)); + obd_max_dirty_pages = ((num_physpages / 10) * 9); + } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) { + obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT); + } + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, sizeof(buf), + *(unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_alloc_fail_rate) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, 21, + *(unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_at_min) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_max) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_extra) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_early_margin) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_history) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} + +#ifdef CONFIG_SYSCTL +static ctl_table_t obd_table[] = { + { + INIT_CTL_NAME(OBD_TIMEOUT) + .procname = "timeout", + .data = &obd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT) + .procname = "debug_peer_on_timeout", + .data = &obd_debug_peer_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT) + .procname = "dump_on_timeout", + .data = &obd_dump_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_EVICTION) + .procname = "dump_on_eviction", + .data = &obd_dump_on_eviction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_MEMUSED) + .procname = "memused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_memory_alloc + }, + { + INIT_CTL_NAME(OBD_PAGESUSED) + .procname = "pagesused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_alloc + }, + { + INIT_CTL_NAME(OBD_MAXMEMUSED) + .procname = "memused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_mem_max + }, + { + INIT_CTL_NAME(OBD_MAXPAGESUSED) + .procname = "pagesused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_max + }, + { + INIT_CTL_NAME(OBD_LDLM_TIMEOUT) + .procname = "ldlm_timeout", + .data = &ldlm_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE) + .procname = "alloc_fail_rate", + .data = &obd_alloc_fail_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_alloc_fail_rate + }, + { + INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES) + .procname = "max_dirty_mb", + .data = &obd_max_dirty_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_max_dirty_pages_in_mb + }, + { + INIT_CTL_NAME(OBD_AT_MIN) + .procname = "at_min", + .data = &at_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_min + }, + { + INIT_CTL_NAME(OBD_AT_MAX) + .procname = "at_max", + .data = &at_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_max + }, + { + INIT_CTL_NAME(OBD_AT_EXTRA) + .procname = "at_extra", + .data = &at_extra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_extra + }, + { + INIT_CTL_NAME(OBD_AT_EARLY_MARGIN) + .procname = "at_early_margin", + .data = &at_early_margin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_early_margin + }, + { + INIT_CTL_NAME(OBD_AT_HISTORY) + .procname = "at_history", + .data = &at_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_history + }, + { INIT_CTL_NAME(0) } +}; + +static ctl_table_t parent_table[] = { + { + INIT_CTL_NAME(OBD_SYSCTL) + .procname = "lustre", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = obd_table + }, + { INIT_CTL_NAME(0) } +}; +#endif + +void obd_sysctl_init (void) +{ +#ifdef CONFIG_SYSCTL + if ( !obd_table_header ) + obd_table_header = cfs_register_sysctl_table(parent_table, 0); +#endif +} + +void obd_sysctl_clean (void) +{ +#ifdef CONFIG_SYSCTL + if ( obd_table_header ) + unregister_sysctl_table(obd_table_header); + obd_table_header = NULL; +#endif +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c new file mode 100644 index 000000000000..b1d215e56991 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog.c @@ -0,0 +1,966 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alex Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return ERR_PTR(-ENOMEM); + + init_rwsem(&loghandle->lgh_lock); + spin_lock_init(&loghandle->lgh_hdr_lock); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. Used in llog_close() only + */ +void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (!loghandle->lgh_hdr) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int rc = 0; + ENTRY; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n", + index, POSTID(&loghandle->lgh_id.lgl_oi)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + RETURN(-EINVAL); + } + + spin_lock(&loghandle->lgh_hdr_lock); + if (!ext2_clear_bit(index, llh->llh_bitmap)) { + spin_unlock(&loghandle->lgh_hdr_lock); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + RETURN(-ENOENT); + } + + llh->llh_count--; + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { + spin_unlock(&loghandle->lgh_hdr_lock); + rc = llog_destroy(env, loghandle); + if (rc < 0) { + CERROR("%s: can't destroy empty llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(1); + } + spin_unlock(&loghandle->lgh_hdr_lock); + + rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); + if (rc < 0) { + CERROR("%s: fail to write header for llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(0); +out_err: + spin_lock(&loghandle->lgh_hdr_lock); + ext2_set_bit(index, llh->llh_bitmap); + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + return rc; +} +EXPORT_SYMBOL(llog_cancel_rec); + +static int llog_read_header(const struct lu_env *env, + struct llog_handle *handle, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE; + llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0; + llh->llh_timestamp = cfs_time_current_sec(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + ext2_set_bit(0, llh->llh_bitmap); + rc = 0; + } + return rc; +} + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + int rc; + + ENTRY; + LASSERT(handle->lgh_hdr == NULL); + + OBD_ALLOC_PTR(llh); + if (llh == NULL) + RETURN(-ENOMEM); + handle->lgh_hdr = llh; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + (llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } +out: + if (rc) { + OBD_FREE_PTR(llh); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_copy_handler(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, + void *data) +{ + struct llog_rec_hdr local_rec = *rec; + struct llog_handle *local_llh = (struct llog_handle *)data; + char *cfg_buf = (char*) (rec + 1); + struct lustre_cfg *lcfg; + int rc = 0; + ENTRY; + + /* Append all records */ + local_rec.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail); + rc = llog_write(env, local_llh, &local_rec, NULL, 0, + (void *)cfg_buf, -1); + + lcfg = (struct lustre_cfg *)cfg_buf; + CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n", + rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command, + lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_copy_handler); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + + ENTRY; + + LASSERT(llh); + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, llh->llh_bitmap)) + ++index; + + LASSERT(index <= last_index + 1); + if (index == last_index + 1) + break; +repeat: + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + last_offset = cur_offset; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. */ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){ + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (rec->lrh_index == 0) { + /* probably another rec just got added? */ + if (index <= loghandle->lgh_last_idx) + GOTO(repeat, rc = 0); + GOTO(out, rc = 0); /* no more records */ + } + if (rec->lrh_len == 0 || + rec->lrh_len > LLOG_CHUNK_SIZE) { + CWARN("invalid length %d in llog record for " + "index %d/%d\n", rec->lrh_len, + rec->lrh_index, index); + GOTO(out, rc = -EINVAL); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + last_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); + } + + /* next record, still in buffer? */ + ++index; + if (index > last_index) + GOTO(out, rc = 0); + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. */ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + OBD_FREE_PTR(lpi); + RETURN(rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + return llog_process_or_fork(env, loghandle, cb, data, catdata, true); +} +EXPORT_SYMBOL(llog_process); + +inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} +EXPORT_SYMBOL(llog_get_size); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + ENTRY; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(env, loghandle, index, buf, + LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(env, loghandle, + tail->lrt_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? */ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_create); + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_create); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_write_rec); + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + LASSERT(lop); + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + if (buf) + buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + else + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies, + buf, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write_rec); + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. + */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + if ((*res)->lgh_obj != NULL) { + struct dt_device *d; + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); + } else { + /* lvfs compat code */ + LASSERT((*res)->lgh_file == NULL); + rc = llog_create(env, *res, NULL); + } +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. + */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx) +{ + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + if (loghandle->lgh_obj != NULL) { + struct dt_device *dt; + struct thandle *th; + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compatibility */ + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, NULL); + up_write(&loghandle->lgh_lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + GOTO(out, rc); + if (lop->lop_close == NULL) + GOTO(out, rc = -EOPNOTSUPP); + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_close); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c new file mode 100644 index 000000000000..cf00b2f550ac --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -0,0 +1,833 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + + struct llog_log_hdr *llh; + struct llog_logid_rec rec = { { 0 }, }; + int rc, index, bitmap_size; + ENTRY; + + llh = cathandle->lgh_hdr; + bitmap_size = LLOG_BITMAP_SIZE(llh); + + index = (cathandle->lgh_last_idx + 1) % bitmap_size; + + /* maximum number of available slots in catlog is bitmap_size - 2 */ + if (llh->llh_cat_idx == index) { + CERROR("no free catalog slots for log...\n"); + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + RETURN(0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc) + GOTO(out_destroy, rc); + + if (index == 0) + index = 1; + + spin_lock(&loghandle->lgh_hdr_lock); + llh->llh_count++; + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", + index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + spin_unlock(&loghandle->lgh_hdr_lock); + + cathandle->lgh_last_idx = index; + llh->llh_tail.lrt_index = index; + + CDEBUG(D_RPCTRACE,"new recovery log "DOSTID":%x for index %u of catalog" + DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, index, + POSTID(&cathandle->lgh_id.lgl_oi)); + /* build the record for this log in the catalog */ + rec.lid_hdr.lrh_len = sizeof(rec); + rec.lid_hdr.lrh_index = index; + rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec.lid_id = loghandle->lgh_id; + rec.lid_tail.lrt_len = sizeof(rec); + rec.lid_tail.lrt_index = index; + + /* update the catalog: header and record */ + rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + &loghandle->u.phd.phd_cookie, 1, NULL, index, th); + if (rc < 0) + GOTO(out_destroy, rc); + + loghandle->lgh_hdr->llh_cat_idx = index; + RETURN(0); +out_destroy: + llog_destroy(env, loghandle); + RETURN(rc); +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. + * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CERROR("%s: log "DOSTID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), cgl->lgl_ogen, + logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + GOTO(out, rc = 0); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + RETURN(rc); + } + + down_write(&cathandle->lgh_lock); + list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + EXIT; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || + loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_write(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th) +{ + struct llog_handle *loghandle; + int rc; + ENTRY; + + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th); + if (rc < 0) + CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + if (rc == -ENOSPC) { + /* try to use next log */ + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + /* new llog can be created concurrently */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, + -1, th); + if (rc < 0) + CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_handle *loghandle, *next; + int rc = 0; + + ENTRY; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } else if (cathandle->u.chd.chd_next_log == NULL) { + /* declare next plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_next_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_next_log = loghandle; + list_add_tail(&loghandle->u.phd.phd_entry, + &cathandle->u.chd.chd_head); + } + } + up_write(&cathandle->lgh_lock); + } + if (rc) + GOTO(out, rc); + + if (!llog_exist(cathandle->u.chd.chd_current_log)) { + rc = llog_declare_create(env, cathandle->u.chd.chd_current_log, + th); + if (rc) + GOTO(out, rc); + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + /* declare records in the llogs */ + rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, + rec, -1, th); + if (rc) + GOTO(out, rc); + + next = cathandle->u.chd.chd_next_log; + if (next) { + if (!llog_exist(next)) { + rc = llog_declare_create(env, next, th); + llog_declare_write_rec(env, cathandle, NULL, -1, th); + } + llog_declare_write_rec(env, next, rec, -1, th); + } +out: + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_declare_add_rec); + +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf) +{ + struct llog_ctxt *ctxt; + struct dt_device *dt; + struct thandle *th = NULL; + int rc; + + ctxt = cathandle->lgh_ctxt; + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + + if (cathandle->lgh_obj != NULL) { + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compat code */ + LASSERT(cathandle->lgh_file != NULL); + rc = llog_cat_declare_add_rec(env, cathandle, rec, th); + if (rc == 0) + rc = llog_cat_add_rec(env, cathandle, rec, reccookie, + buf, th); + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add); + +/* For each cookie in the cookie array, we clear the log in-use bit and either: + * - the log is empty, so mark it free in the catalog header and delete it + * - the log is not empty, just write out the log header + * + * The cookies may be in different log files, so we need to get new logs + * each time. + * + * Assumes caller has already pushed us into the kernel context. + */ +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies) +{ + int i, index, rc = 0, failed = 0; + + ENTRY; + + for (i = 0; i < count; i++, cookies++) { + struct llog_handle *loghandle; + struct llog_logid *lgl = &cookies->lgc_lgl; + int lrc; + + rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lgl->lgl_oi), rc); + failed++; + continue; + } + + lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index); + if (lrc == 1) { /* log has been destroyed */ + index = loghandle->u.phd.phd_cookie.lgc_index; + rc = llog_cat_cleanup(env, cathandle, loghandle, + index); + } else if (lrc == -ENOENT) { + if (rc == 0) /* ENOENT shouldn't rewrite any error */ + rc = lrc; + } else if (lrc < 0) { + failed++; + rc = lrc; + } + llog_handle_put(loghandle); + } + if (rc) + CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, failed, count, + rc); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_cancel_records); + +int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + RETURN(rc); + } + + if (rec->lrh_index < d->lpd_startcat) + /* Skip processing of the logs until startcat */ + RETURN(0); + + if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + llog_handle_put(llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + struct llog_process_cat_data cd; + + CWARN("catlog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + } else { + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, NULL, fork); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_process_or_fork); + +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx) +{ + return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat, + startidx, false); +} +EXPORT_SYMBOL(llog_cat_process); + +static int llog_cat_reverse_process_cb(const struct lu_env *env, + struct llog_handle *cat_llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_process_data *d = data; + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + RETURN(rc); + } + + rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL); + llog_handle_put(llh); + RETURN(rc); +} + +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data) +{ + struct llog_process_data d; + struct llog_process_cat_data cd; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + CWARN("catalog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx); + cd.lpcd_last_idx = 0; + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, &cd); + } else { + rc = llog_reverse_process(env, cat_llh, + llog_cat_reverse_process_cb, + &d, NULL); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_reverse_process); + +int llog_cat_set_first_idx(struct llog_handle *cathandle, int index) +{ + struct llog_log_hdr *llh = cathandle->lgh_hdr; + int i, bitmap_size, idx; + ENTRY; + + bitmap_size = LLOG_BITMAP_SIZE(llh); + if (llh->llh_cat_idx == (index - 1)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + if (idx == cathandle->lgh_last_idx) + goto out; + for (i = (index + 1) % bitmap_size; + i != cathandle->lgh_last_idx; + i = (i + 1) % bitmap_size) { + if (!ext2_test_bit(i, llh->llh_bitmap)) { + idx = llh->llh_cat_idx + 1; + llh->llh_cat_idx = idx; + } else if (i == 0) { + llh->llh_cat_idx = 0; + } else { + break; + } + } +out: + CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n", + POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx); + } + + RETURN(0); +} + +/* Cleanup deleted plain llog traces from catalog */ +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index) +{ + int rc; + + LASSERT(index); + if (loghandle != NULL) { + /* remove destroyed llog from catalog list and + * chd_current_log variable */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == loghandle) + cathandle->u.chd.chd_current_log = NULL; + list_del_init(&loghandle->u.phd.phd_entry); + up_write(&cathandle->lgh_lock); + LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index); + /* llog was opened and keep in a list, close it now */ + llog_close(env, loghandle); + } + /* remove plain llog entry from catalog by index */ + llog_cat_set_first_idx(cathandle, index); + rc = llog_cancel_rec(env, cathandle, index); + if (rc == 0) + CDEBUG(D_HA, "cancel plain log at index" + " %u of catalog "DOSTID"\n", + index, POSTID(&cathandle->lgh_id.lgl_oi)); + return rc; +} + +int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + struct llog_log_hdr *llh; + int rc; + + ENTRY; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + if (rc == -ENOENT || rc == -ESTALE) { + /* remove index from catalog */ + llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index); + } + RETURN(rc); + } + + llh = loghandle->lgh_hdr; + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: fail to destroy empty log: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + + llog_cat_cleanup(env, cathandle, loghandle, + loghandle->u.phd.phd_cookie.lgc_index); + } + llog_handle_put(loghandle); + + RETURN(rc); +} +EXPORT_SYMBOL(cat_cancel_cb); + +/* helper to initialize catalog llog and process it to cancel */ +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh) +{ + int rc; + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL); + if (rc) + RETURN(rc); + + rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false); + if (rc) + CERROR("%s: llog_process() with cat_cancel_cb failed: rc = " + "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc); + RETURN(0); +} +EXPORT_SYMBOL(llog_cat_init_and_process); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h new file mode 100644 index 000000000000..539e1d4f9d4c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LLOG_INTERNAL_H__ +#define __LLOG_INTERNAL_H__ + +#include + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; + const struct lu_env *lpi_env; + +}; + +struct llog_thread_info { + struct lu_attr lgi_attr; + struct lu_fid lgi_fid; + struct dt_object_format lgi_dof; + struct lu_buf lgi_buf; + loff_t lgi_off; + struct llog_rec_hdr lgi_lrh; + struct llog_rec_tail lgi_tail; +}; + +extern struct lu_context_key llog_thread_key; + +static inline struct llog_thread_info *llog_info(const struct lu_env *env) +{ + struct llog_thread_info *lgi; + + lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key); + LASSERT(lgi); + return lgi; +} + +static inline void +lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen) +{ + ostid_set_seq_llog(&logid->lgl_oi); + ostid_set_id(&logid->lgl_oi, ino); + logid->lgl_ogen = gen; +} + +int llog_info_init(void); +void llog_info_fini(void); + +void llog_handle_get(struct llog_handle *loghandle); +void llog_handle_put(struct llog_handle *loghandle); +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid); +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data); +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size); +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork); +int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle *loghandle, int index); +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c new file mode 100644 index 000000000000..0732874e26c5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,427 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include +#include +#include "llog_internal.h" + +static int str2logid(struct llog_logid *logid, char *str, int len) +{ + char *start, *end, *endp; + __u64 id, seq; + + ENTRY; + start = str; + if (*start != '#') + RETURN(-EINVAL); + + start++; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + id = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + end = strchr(start, '#'); + if (end == NULL || end == start) + RETURN(-EINVAL); + + *end = '\0'; + seq = simple_strtoull(start, &endp, 0); + if (endp != end) + RETURN(-EINVAL); + + ostid_set_seq(&logid->lgl_oi, seq); + ostid_set_id(&logid->lgl_oi, id); + + start = ++end; + if (start - str >= len - 1) + RETURN(-EINVAL); + logid->lgl_ogen = simple_strtoul(start, &endp, 16); + if (*endp != '\0') + RETURN(-EINVAL); + + RETURN(0); +} + +static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + static int l, remains, from, to; + static char *out; + char *endp; + int cur_index, rc = 0; + + ENTRY; + + if (ioc_data && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + ioc_data->ioc_inllen1 = 0; + out = ioc_data->ioc_bulk; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *loghandle; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d failed\n", + cur_index, rec->lrh_type, + rec->lrh_len); + } + if (handle->lgh_ctxt == NULL) + RETURN(-EOPNOTSUPP); + rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n", + POSTID(&lir->lid_id.lgl_oi), + lir->lid_id.lgl_ogen); + RETURN(rc); + } + rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL); + llog_handle_put(loghandle); + } else { + bool ok; + + switch (rec->lrh_type) { + case OST_SZ_REC: + case MDS_UNLINK_REC: + case MDS_UNLINK64_REC: + case MDS_SETATTR64_REC: + case OBD_CFG_REC: + case LLOG_GEN_REC: + case LLOG_HDR_MAGIC: + ok = true; + break; + default: + ok = false; + } + + l = snprintf(out, remains, "[index]: %05d [type]: " + "%02x [len]: %04d %s\n", + cur_index, rec->lrh_type, rec->lrh_len, + ok ? "ok" : "failed"); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: no space to print log records\n", + handle->lgh_ctxt->loc_obd->obd_name); + RETURN(-LLOG_EEMPTY); + } + } + RETURN(rc); +} + +static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data; + static int l, remains, from, to; + static char *out; + char *endp; + int cur_index; + + ENTRY; + if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) { + l = 0; + remains = ioc_data->ioc_inllen4 + + cfs_size_round(ioc_data->ioc_inllen1) + + cfs_size_round(ioc_data->ioc_inllen2) + + cfs_size_round(ioc_data->ioc_inllen3); + from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + RETURN(-EINVAL); + out = ioc_data->ioc_bulk; + ioc_data->ioc_inllen1 = 0; + } + + cur_index = rec->lrh_index; + if (cur_index < from) + RETURN(0); + if (to > 0 && cur_index > to) + RETURN(-LLOG_EEMPTY); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) { + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + l = snprintf(out, remains, + "[index]: %05d [logid]: #"DOSTID"#%08x\n", + cur_index, POSTID(&lir->lid_id.lgl_oi), + lir->lid_id.lgl_ogen); + } else if (rec->lrh_type == OBD_CFG_REC) { + int rc; + + rc = class_config_parse_rec(rec, out, remains); + if (rc < 0) + RETURN(rc); + l = rc; + } else { + l = snprintf(out, remains, + "[index]: %05d [type]: %02x [len]: %04d\n", + cur_index, rec->lrh_type, rec->lrh_len); + } + out += l; + remains -= l; + if (remains <= 0) { + CERROR("not enough space for print log records\n"); + RETURN(-LLOG_EEMPTY); + } + + RETURN(0); +} +static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat, + struct llog_logid *logid) +{ + struct llog_handle *log; + int rc; + + ENTRY; + + rc = llog_cat_id2handle(env, cat, &log, logid); + if (rc) { + CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n", + POSTID(&logid->lgl_oi), logid->lgl_ogen); + RETURN(-ENOENT); + } + + rc = llog_destroy(env, log); + if (rc) { + CDEBUG(D_IOCTL, "cannot destroy log\n"); + GOTO(out, rc); + } + llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index); +out: + llog_handle_put(log); + RETURN(rc); + +} + +static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) + RETURN(-EINVAL); + rc = llog_remove_log(env, handle, &lir->lid_id); + + RETURN(rc); +} + + +int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd, + struct obd_ioctl_data *data) +{ + struct llog_logid logid; + int rc = 0; + struct llog_handle *handle = NULL; + + ENTRY; + + if (*data->ioc_inlbuf1 == '#') { + rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1); + if (rc) + RETURN(rc); + rc = llog_open(env, ctxt, &handle, &logid, NULL, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else if (*data->ioc_inlbuf1 == '$') { + char *name = data->ioc_inlbuf1 + 1; + + rc = llog_open(env, ctxt, &handle, NULL, name, + LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + } else { + RETURN(-EINVAL); + } + + rc = llog_init_handle(env, handle, 0, NULL); + if (rc) + GOTO(out_close, rc = -ENOENT); + + switch (cmd) { + case OBD_IOC_LLOG_INFO: { + int l; + int remains = data->ioc_inllen2 + + cfs_size_round(data->ioc_inllen1); + char *out = data->ioc_bulk; + + l = snprintf(out, remains, + "logid: #"DOSTID"#%08x\n" + "flags: %x (%s)\n" + "records count: %d\n" + "last index: %d\n", + POSTID(&handle->lgh_id.lgl_oi), + handle->lgh_id.lgl_ogen, + handle->lgh_hdr->llh_flags, + handle->lgh_hdr->llh_flags & + LLOG_F_IS_CAT ? "cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + char *endp; + + cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + GOTO(out_close, rc = -EINVAL); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(NULL, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 > 0) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, handle, llog_delete_cb, NULL, + NULL); + if (rc) + GOTO(out_close, rc); + } + break; + } + default: + CERROR("%s: Unknown ioctl cmd %#x\n", + ctxt->loc_obd->obd_name, cmd); + GOTO(out_close, rc = -ENOTTY); + } + +out_close: + if (handle->lgh_hdr && + handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + llog_cat_close(env, handle); + else + llog_close(env, handle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_ioctl); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c new file mode 100644 index 000000000000..7e12dc62141f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c @@ -0,0 +1,862 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_lvfs.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "llog_internal.h" + +#if defined(LLOG_LVFS) + +static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file, + int len, int index) +{ + struct llog_rec_hdr rec = { 0 }; + struct llog_rec_tail tail; + int rc; + ENTRY; + + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + tail.lrt_len = rec.lrh_len = len; + tail.lrt_index = rec.lrh_index = index; + rec.lrh_type = LLOG_PAD_MAGIC; + + rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0); + if (rc) { + CERROR("error writing padding record: rc %d\n", rc); + goto out; + } + + file->f_pos += len - sizeof(rec) - sizeof(tail); + rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),&file->f_pos,0); + if (rc) { + CERROR("error writing padding record: rc %d\n", rc); + goto out; + } + + out: + RETURN(rc); +} + +static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file, + struct llog_rec_hdr *rec, void *buf, loff_t off) +{ + int rc; + struct llog_rec_tail end; + loff_t saved_off = file->f_pos; + int buflen = rec->lrh_len; + + ENTRY; + + file->f_pos = off; + + if (buflen == 0) + CWARN("0-length record\n"); + + if (!buf) { + rc = fsfilt_write_record(obd, file, rec, buflen,&file->f_pos,0); + if (rc) { + CERROR("error writing log record: rc %d\n", rc); + goto out; + } + GOTO(out, rc = 0); + } + + /* the buf case */ + rec->lrh_len = sizeof(*rec) + buflen + sizeof(end); + rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0); + if (rc) { + CERROR("error writing log hdr: rc %d\n", rc); + goto out; + } + + rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0); + if (rc) { + CERROR("error writing log buffer: rc %d\n", rc); + goto out; + } + + end.lrt_len = rec->lrh_len; + end.lrt_index = rec->lrh_index; + rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0); + if (rc) { + CERROR("error writing log tail: rc %d\n", rc); + goto out; + } + + rc = 0; + out: + if (saved_off > file->f_pos) + file->f_pos = saved_off; + LASSERT(rc <= 0); + RETURN(rc); +} + +static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file, + void *buf, int size, loff_t off) +{ + loff_t offset = off; + int rc; + ENTRY; + + rc = fsfilt_read_record(obd, file, buf, size, &offset); + if (rc) { + CERROR("error reading log record: rc %d\n", rc); + RETURN(rc); + } + RETURN(0); +} + +static int llog_lvfs_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_device *obd; + int rc; + ENTRY; + + LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE); + + obd = handle->lgh_ctxt->loc_exp->exp_obd; + + if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr, + LLOG_CHUNK_SIZE, 0); + if (rc) { + CERROR("error reading log header from %.*s\n", + handle->lgh_file->f_dentry->d_name.len, + handle->lgh_file->f_dentry->d_name.name); + } else { + struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr; + + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log %.*s header magic: %#x (expected %#x)\n", + handle->lgh_file->f_dentry->d_name.len, + handle->lgh_file->f_dentry->d_name.name, + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("incorrectly sized log %.*s header: %#x " + "(expected %#x)\n", + handle->lgh_file->f_dentry->d_name.len, + handle->lgh_file->f_dentry->d_name.name, + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. */ +static int llog_lvfs_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_log_hdr *llh; + int reclen = rec->lrh_len, index, rc; + struct llog_rec_tail *lrt; + struct obd_device *obd; + struct file *file; + size_t left; + ENTRY; + + llh = loghandle->lgh_hdr; + file = loghandle->lgh_file; + obd = loghandle->lgh_ctxt->loc_exp->exp_obd; + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. */ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + loff_t saved_offset; + + /* no header: only allowed to insert record 1 */ + if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) { + CERROR("idx != -1 in empty log\n"); + LBUG(); + } + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("Modify unset record %u\n", idx); + if (idx != rec->lrh_index) + CERROR("Index mismatch %d %u\n", idx, rec->lrh_index); + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + saved_offset = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(saved_offset - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("modify idx mismatch %u/%d\n", + idx, loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + saved_offset = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. + */ + left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1)); + + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_lvfs_pad(obd, file, left, index); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *) + ((char *)rec + rec->lrh_len - sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /*The caller should make sure only 1 process access the lgh_last_idx, + *Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + llh->llh_tail.lrt_index = index; + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + if (rc) + RETURN(rc); + + rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos); + if (rc) + RETURN(rc); + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + if (rc == 0 && rec->lrh_type == LLOG_GEN_REC) + rc = 1; + + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of +* minimum sized log records we are skipping. If it turns out +* that we are not far enough along the log (because the +* actual records are larger than minimum size) we just skip +* some more records. */ + +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_lvfs_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos; + int llen; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + ppos = *cur_offset; + llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, llen, + cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + *cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = *cur_offset - ppos; + if (rc < len) { + /* signal the end of the valid buffer to llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset " + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("missed desired record? %u > %u\n", + rec->lrh_index, next_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static int llog_lvfs_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + __u64 cur_offset; + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos = cur_offset; + + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, len, + &cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = cur_offset - ppos; + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("missed desired record? %u > %u\n", + rec->lrh_index, prev_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static struct file *llog_filp_open(char *dir, char *name, int flags, int mode) +{ + char *logname; + struct file *filp; + int len; + + OBD_ALLOC(logname, PATH_MAX); + if (logname == NULL) + return ERR_PTR(-ENOMEM); + + len = snprintf(logname, PATH_MAX, "%s/%s", dir, name); + if (len >= PATH_MAX - 1) { + filp = ERR_PTR(-ENAMETOOLONG); + } else { + filp = l_filp_open(logname, flags, mode); + if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT) + CERROR("logfile creation %s: %ld\n", logname, + PTR_ERR(filp)); + } + OBD_FREE(logname, PATH_MAX); + return filp; +} + +static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct l_dentry *dchild = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + obd = ctxt->loc_exp->exp_obd; + + LASSERT(handle); + if (logid != NULL) { + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi, + logid->lgl_ogen); + if (IS_ERR(dchild)) { + rc = PTR_ERR(dchild); + CERROR("%s: error looking up logfile #"DOSTID "#%08x:" + " rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + if (dchild->d_inode == NULL) { + l_dput(dchild); + rc = -ENOENT; + CERROR("%s: nonexistent llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, + O_RDWR | O_LARGEFILE); + l_dput(dchild); + if (IS_ERR(handle->lgh_file)) { + rc = PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + CERROR("%s: error opening llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_id = *logid; + } else if (name) { + handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name, + O_RDWR | O_LARGEFILE, 0644); + if (IS_ERR(handle->lgh_file)) { + rc = PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + rc = 0; + } else { + GOTO(out, rc); + } + } else { + lustre_build_llog_lvfs_oid(&handle->lgh_id, + handle->lgh_file->f_dentry->d_inode->i_ino, + handle->lgh_file->f_dentry->d_inode->i_generation); + } + } else { + LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param); + handle->lgh_file = NULL; + } + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL) + GOTO(out_name, rc = -ENOENT); + + RETURN(0); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + RETURN(rc); +} + +static int llog_lvfs_exist(struct llog_handle *handle) +{ + return (handle->lgh_file != NULL); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. */ +static int llog_lvfs_create(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct obd_device *obd; + struct l_dentry *dchild = NULL; + struct file *file; + struct obdo *oa = NULL; + int rc = 0; + int open_flags = O_RDWR | O_CREAT | O_LARGEFILE; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + obd = ctxt->loc_exp->exp_obd; + LASSERT(handle->lgh_file == NULL); + + if (handle->lgh_name) { + file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name, + open_flags, 0644); + if (IS_ERR(file)) + RETURN(PTR_ERR(file)); + + lustre_build_llog_lvfs_oid(&handle->lgh_id, + file->f_dentry->d_inode->i_ino, + file->f_dentry->d_inode->i_generation); + handle->lgh_file = file; + } else { + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + ostid_set_seq_llog(&oa->o_oi); + oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP; + + rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* FIXME: rationalize the misuse of o_generation in + * this API along with mds_obd_{create,destroy}. + * Hopefully it is only an internal API issue. */ +#define o_generation o_parent_oid + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi, + oa->o_generation); + if (IS_ERR(dchild)) + GOTO(out, rc = PTR_ERR(dchild)); + + file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags); + l_dput(dchild); + if (IS_ERR(file)) + GOTO(out, rc = PTR_ERR(file)); + handle->lgh_id.lgl_oi = oa->o_oi; + handle->lgh_id.lgl_ogen = oa->o_generation; + handle->lgh_file = file; +out: + OBDO_FREE(oa); + } + RETURN(rc); +} + +static int llog_lvfs_close(const struct lu_env *env, + struct llog_handle *handle) +{ + int rc; + + ENTRY; + + if (handle->lgh_file == NULL) + RETURN(0); + rc = filp_close(handle->lgh_file, 0); + if (rc) + CERROR("%s: error closing llog #"DOSTID"#%08x: " + "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name, + POSTID(&handle->lgh_id.lgl_oi), + handle->lgh_id.lgl_ogen, rc); + handle->lgh_file = NULL; + if (handle->lgh_name) { + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + handle->lgh_name = NULL; + } + RETURN(rc); +} + +static int llog_lvfs_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct dentry *fdentry; + struct obdo *oa; + struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd; + char *dir; + void *th; + struct inode *inode; + int rc, rc1; + ENTRY; + + dir = MOUNT_CONFIGS_DIR; + + LASSERT(handle->lgh_file); + fdentry = handle->lgh_file->f_dentry; + inode = fdentry->d_parent->d_inode; + if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) { + struct lvfs_run_ctxt saved; + struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dget(fdentry); + rc = llog_lvfs_close(env, handle); + if (rc == 0) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + rc = ll_vfs_unlink(inode, fdentry, mnt); + mutex_unlock(&inode->i_mutex); + } + mntput(mnt); + + dput(fdentry); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + RETURN(rc); + } + + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + oa->o_oi = handle->lgh_id.lgl_oi; + oa->o_generation = handle->lgh_id.lgl_ogen; +#undef o_generation + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER; + + rc = llog_lvfs_close(env, handle); + if (rc) + GOTO(out, rc); + + th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1); + if (IS_ERR(th)) { + CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th)); + GOTO(out, rc = PTR_ERR(th)); + } + + rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa, + NULL, NULL, NULL, NULL); + + rc1 = fsfilt_commit(obd, inode, th, 0); + if (rc == 0 && rc1 != 0) + rc = rc1; + out: + OBDO_FREE(oa); + RETURN(rc); +} + +static int llog_lvfs_declare_create(const struct lu_env *env, + struct llog_handle *res, + struct thandle *th) +{ + return 0; +} + +static int llog_lvfs_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + return 0; +} + +struct llog_operations llog_lvfs_ops = { + .lop_write_rec = llog_lvfs_write_rec, + .lop_next_block = llog_lvfs_next_block, + .lop_prev_block = llog_lvfs_prev_block, + .lop_read_header = llog_lvfs_read_header, + .lop_create = llog_lvfs_create, + .lop_destroy = llog_lvfs_destroy, + .lop_close = llog_lvfs_close, + .lop_open = llog_lvfs_open, + .lop_exist = llog_lvfs_exist, + .lop_declare_create = llog_lvfs_declare_create, + .lop_declare_write_rec = llog_lvfs_declare_write_rec, +}; +EXPORT_SYMBOL(llog_lvfs_ops); +#else /* !__KERNEL__ */ +struct llog_operations llog_lvfs_ops = {}; +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c new file mode 100644 index 000000000000..7e2290796315 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. */ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. + */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + /* + * mds_lov_update_desc() might call here multiple + * times. So if the llog is already set up then + * don't to do it again. + */ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + ENTRY; + + if (!ctxt) + RETURN(0); + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies) +{ + int raised, rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + if (ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) + RETURN(-ENXIO); + + CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP); + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies, + numcookies); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_obd_add); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie *cookies, int flags) +{ + int rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cancel); + +int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *index) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_init, 0); + OBD_COUNTER_INCREMENT(obd, llog_init); + + rc = OBP(obd, llog_init)(obd, olg, disk_obd, index); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_init); + +int obd_llog_finish(struct obd_device *obd, int count) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_finish, 0); + OBD_COUNTER_INCREMENT(obd, llog_finish); + + rc = OBP(obd, llog_finish)(obd, count); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_finish); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); +EXPORT_SYMBOL(llog_thread_key); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c new file mode 100644 index 000000000000..6dbd21a863c2 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c @@ -0,0 +1,1323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API + * + * Author: Alexey Zhuravlev + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#ifndef EXPORT_SYMTAB +#define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include + +#include "llog_internal.h" +#include "local_storage.h" + +/* + * - multi-chunks or big-declaration approach + * - use unique sequence instead of llog sb tracking unique ids + * - re-use existing environment + * - named llog support (can be used for testing only at the present) + * - llog_origin_connect() work with OSD API + */ + +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len; + lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index; + lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC; + + lgi->lgi_buf.lb_buf = &lgi->lgi_lrh; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh); + dt_write_lock(env, o, 0); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); +out: + dt_write_unlock(env, o); + RETURN(rc); +} + +static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o, + struct llog_rec_hdr *rec, void *buf, + loff_t *off, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int buflen = rec->lrh_len; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(o); + + if (buflen == 0) + CWARN("0-length record\n"); + + CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n", + rec->lrh_type, buf, buflen, *off); + + lgi->lgi_attr.la_valid = LA_SIZE; + lgi->lgi_attr.la_size = *off; + + if (!buf) { + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + /* the buf case */ + /* protect the following 3 writes from concurrent read */ + dt_write_lock(env, o, 0); + rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_len = sizeof(*rec); + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log hdr: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = buf; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log buffer: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_tail.lrt_len = rec->lrh_len; + lgi->lgi_tail.lrt_index = rec->lrh_index; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log tail: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + +out_unlock: + dt_write_unlock(env, o); + +out: + /* cleanup the content written above */ + if (rc) { + dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th, + BYPASS_CAPA); + dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA); + } + + RETURN(rc); +} + +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + int rc; + + ENTRY; + + LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE); + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE; + + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + if (rc) { + CERROR("%s: error reading log header from "DFID": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc); + RETURN(rc); + } + + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + RETURN(-EIO); + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + RETURN(-EIO); + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + RETURN(0); +} + +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + + o = loghandle->lgh_obj; + LASSERT(o); + + /* each time we update header */ + rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + if (dt_object_exists(o)) { + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + lgi->lgi_off = lgi->lgi_attr.la_size; + LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE)); + if (rc) + RETURN(rc); + + rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th); + if (rc) + RETURN(rc); + } else { + lgi->lgi_off = 0; + } + + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. */ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc, old_tail_idx; + struct llog_rec_tail *lrt; + struct dt_object *o; + size_t left; + + ENTRY; + + LASSERT(env); + llh = loghandle->lgh_hdr; + LASSERT(llh); + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(th); + + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. */ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + /* no header: only allowed to insert record 1 */ + if (idx != 1 && lgi->lgi_attr.la_size == 0) + LBUG(); + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + if (idx != rec->lrh_index) + CERROR("%s: index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + lgi->lgi_off = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, + rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(lgi->lgi_off - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("%s: modify idx mismatch %u/%d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. + */ + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len - + sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /* The caller should make sure only 1 process access the lgh_last_idx, + * Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("%s: index %u already set in log bitmap\n", + o->do_lu.lo_dev->ld_obd->obd_name, index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + old_tail_idx = llh->llh_tail.lrt_index; + llh->llh_tail.lrt_index = index; + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off, + th); + if (rc) + GOTO(out, rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + +out: + /* cleanup llog for error case */ + if (rc) { + spin_lock(&loghandle->lgh_hdr_lock); + ext2_clear_bit(index, llh->llh_bitmap); + llh->llh_count--; + spin_unlock(&loghandle->lgh_hdr_lock); + + /* restore the header */ + loghandle->lgh_last_idx--; + llh->llh_tail.lrt_index = old_tail_idx; + lgi->lgi_off = 0; + llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + } + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. + */ +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE - + (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + lgi->lgi_buf.lb_buf = buf; + + /* Note: read lock is not needed around la_size get above at + * the time of dt_attr_get(). There are only two cases that + * matter. Either la_size == cur_offset, in which case the + * entire read is skipped, or la_size > cur_offset and the loop + * is entered and this thread is blocked at dt_read_lock() + * until the write is completed. When the write completes, then + * the dt_read() will be done with the full length, and will + * get the full data. + */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + int rc; + + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + /* It is OK to have locking around dt_read() only, see + * comment in llog_osd_next_block for details + */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + } + + o = ls_locate(env, ls, &lgi->lgi_fid); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) + GOTO(out_put, rc = -ENOENT); + + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + lu_object_put(env, &o->do_lu); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + dt_los_put(los); + RETURN(rc); +} + +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return (dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header)); +} + +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, th); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. */ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, + th, BYPASS_CAPA, 1); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + lu_object_put(env, &handle->lgh_obj->do_lu); + + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + struct dt_device *d; + struct thandle *th; + char *name = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + d = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(d); + LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_trans, rc = PTR_ERR(llog_dir)); + + name = loghandle->lgh_name; + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)name, th); + if (rc) + GOTO(out_trans, rc); + } + + dt_declare_ref_del(env, o, th); + + rc = dt_declare_destroy(env, o, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) { + if (name) { + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *) name, + th, BYPASS_CAPA); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + name, rc); + GOTO(out_unlock, rc); + } + } + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc) + GOTO(out_unlock, rc); + } +out_unlock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (llog_dir != NULL) + lu_object_put(env, &llog_dir->do_lu); + RETURN(rc); +} + +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct local_oid_storage *los; + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + if (rc < 0) + return rc; + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los, *nlos; + + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, FID_SEQ_LLOG); + nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME); + mutex_unlock(&ls->ls_los_mutex); + if (los != NULL) { + dt_los_put(los); + local_oid_storage_fini(env, los); + } + if (nlos != NULL) { + dt_los_put(nlos); + local_oid_storage_fini(env, nlos); + } + ls_device_put(env, ls); + return 0; +} + +struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +/* reads the catalog list */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = dt_create(env, o, &lgi->lgi_attr, NULL, + &lgi->lgi_dof, th); + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (rc) + GOTO(out, rc); + } + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n", + (int)lgi->lgi_attr.la_size, size); + + /* return just number of llogs */ + if (idarray == NULL) { + rc = lgi->lgi_attr.la_size / sizeof(*idarray); + GOTO(out, rc); + } + + /* read for new ost index or for empty file */ + memset(idarray, 0, size); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + GOTO(out, rc = 0); + if (lgi->lgi_attr.la_size < lgi->lgi_off + size) + size = lgi->lgi_attr.la_size - lgi->lgi_off; + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + if (rc) { + CERROR("%s: error reading CATALOGS: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + EXIT; +out: + lu_object_put(env, &o->do_lu); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_get_cat_list); + +/* writes the cat list */ +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + if (!count) + RETURN(0); + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc); +out_trans: + dt_trans_stop(env, d, th); +out: + lu_object_put(env, &o->do_lu); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c new file mode 100644 index 000000000000..ea70b99706f6 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -0,0 +1,402 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). + * + * Author: jacob berkman + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n", + POSTID(&d->lgd_logid.lgl_oi)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s (&fid->f_seq); + __swab32s (&fid->f_oid); + __swab32s (&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { + lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_ost_id(&d->lgd_logid.lgl_oi); + __swab32s (&d->lgd_logid.lgl_ogen); + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); + lustre_swab_ost_id(&d->lgdc_logid.lgl_oi); + __swab32s (&d->lgdc_logid.lgl_ogen); + __swab32s (&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_index); + __swab32s (&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (CHANGELOG_REC_EXTENDED(&cr->cr)) { + struct llog_changelog_ext_rec *ext = + (struct llog_changelog_ext_rec *)rec; + + lustre_swab_lu_fid(&ext->cr.cr_sfid); + lustre_swab_lu_fid(&ext->cr.cr_spfid); + tail = &ext->cr_tail; + } else { + tail = &cr->cr_tail; + } + break; + } + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec*)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + tail = &lsr->lsr_tail; + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = &llh->llh_tail; + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_ost_id(&lid->lid_id.lgl_oi); + __swab32s(&lid->lid_id.lgl_ogen); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); + EXIT; +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. + * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " + "for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c new file mode 100644 index 000000000000..d397f781ec43 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c @@ -0,0 +1,1087 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +#include +#include +#include + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. */ +#define LLOG_TEST_RECNUM (LLOG_CHUNK_SIZE * 8) + +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) { + if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) { + last_idx = i; + active_recs++; + } + } + + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_hdr->llh_count != num_recs) { + CERROR("%s: handle->count is %d, expected %d after write\n", + test, llh->lgh_hdr->llh_count, num_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx < last_idx) { + CERROR("%s: handle->last_idx is %d, expected %d after write\n", + test, llh->lgh_last_idx, last_idx); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *loghandle; + struct llog_logid logid; + int rc; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + /* XXX: there is known issue with tests 2b, MGS is not able to create + * anonymous llog, exit now to allow following tests run. + * It is fixed in upcoming llog over OSD code */ + GOTO(out_put, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = loghandle->lgh_id; + llog_close(env, loghandle); + + CWARN("2c: re-open the log by LOGID\n"); + rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + CWARN("2b: destroy this log\n"); + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, loghandle); +out_close_llh: + if (rc) + llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test record writing, single and in bulk */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + struct llog_gen_rec lgr; + int rc, i; + int num_recs = 1; /* 1 for the header */ + + ENTRY; + + lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr); + lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + CWARN("3a: write one create_rec\n"); + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + num_recs++; + if (rc < 0) { + CERROR("3a: write one log record failed: %d\n", rc); + RETURN(rc); + } + + rc = verify_handle("3a", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3b: write 10 cfg log records with 8 bytes bufs\n"); + for (i = 0; i < 10; i++) { + struct llog_rec_hdr hdr; + char buf[8]; + + hdr.lrh_len = 8; + hdr.lrh_type = OBD_CFG_REC; + memset(buf, 0, sizeof buf); + rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1); + if (rc < 0) { + CERROR("3b: write 10 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3c: write 1000 more log records\n"); + for (i = 0; i < 1000; i++) { + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + if (rc < 0) { + CERROR("3c: write 1000 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3c", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n"); + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) { + struct llog_rec_hdr hdr; + char buf_even[24]; + char buf_odd[32]; + + memset(buf_odd, 0, sizeof buf_odd); + memset(buf_even, 0, sizeof buf_even); + if ((i % 2) == 0) { + hdr.lrh_len = 24; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1); + } else { + hdr.lrh_len = 32; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1); + } + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3d: write recs failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("3d: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3d: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3d", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail); + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec.lrh_len = buflen; + rec.lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, &rec, NULL, buf); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == LLOG_TEST_RECNUM) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: print the catalog entries.. we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with cat_cancel_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed:" + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + GOTO(parse_out, rc); + } + + plain_counter = 1; /* llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, NULL, 0, + NULL, -1); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: processed %d records from %d total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different" + "number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +#ifdef LPROCFS +static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} }; +static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_llog_test_module_vars; + lvars->obd_vars = lprocfs_llog_test_obd_vars; +} +#endif + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = cfs_rand(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + struct lprocfs_static_vars lvars; + + lprocfs_llog_test_init_vars(&lvars); + return class_register_type(&llog_obd_ops, NULL, + lvars.module_vars, "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("llog test module"); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c new file mode 100644 index 000000000000..b11ca6706448 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c @@ -0,0 +1,855 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PTR(obj); +} + +struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct lu_object_header *h; + struct ls_object *o; + struct lu_object *l; + + LASSERT(_h == NULL); + + OBD_ALLOC_PTR(o); + if (o != NULL) { + l = &o->ls_obj.do_lu; + h = &o->ls_header; + + lu_object_header_init(h); + dt_object_init(&o->ls_obj, h, d); + lu_object_add_top(h, l); + + l->lo_ops = &ls_lu_obj_ops; + + return l; + } else { + return NULL; + } +} + +static struct lu_device_operations ls_lu_dev_ops = { + .ldo_object_alloc = ls_object_alloc +}; + +static struct ls_device *__ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls, *ret = NULL; + + list_for_each_entry(ls, &ls_list_head, ls_linkage) { + if (ls->ls_osd == dev) { + atomic_inc(&ls->ls_refcount); + ret = ls; + break; + } + } + return ret; +} + +struct ls_device *ls_find_dev(struct dt_device *dev) +{ + struct ls_device *ls; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + mutex_unlock(&ls_list_mutex); + + return ls; +} + +static struct lu_device_type_operations ls_device_type_ops = { + .ldto_start = NULL, + .ldto_stop = NULL, +}; + +static struct lu_device_type ls_lu_type = { + .ldt_name = "local_storage", + .ldt_ops = &ls_device_type_ops, +}; + +struct ls_device *ls_device_get(struct dt_device *dev) +{ + struct ls_device *ls; + + ENTRY; + + mutex_lock(&ls_list_mutex); + ls = __ls_find_dev(dev); + if (ls) + GOTO(out_ls, ls); + + /* not found, then create */ + OBD_ALLOC_PTR(ls); + if (ls == NULL) + GOTO(out_ls, ls = ERR_PTR(-ENOMEM)); + + atomic_set(&ls->ls_refcount, 1); + INIT_LIST_HEAD(&ls->ls_los_list); + mutex_init(&ls->ls_los_mutex); + + ls->ls_osd = dev; + + LASSERT(dev->dd_lu_dev.ld_site); + lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type); + ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops; + ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site; + + /* finally add ls to the list */ + list_add(&ls->ls_linkage, &ls_list_head); +out_ls: + mutex_unlock(&ls_list_mutex); + RETURN(ls); +} + +void ls_device_put(const struct lu_env *env, struct ls_device *ls) +{ + LASSERT(env); + if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = los->los_last_oid++; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + rc = dt_declare_record_write(env, los->los_obj, + sizeof(struct los_ondisk), 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + struct los_ondisk losd; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + losd.lso_magic = cpu_to_le32(LOS_MAGIC); + losd.lso_next_oid = cpu_to_le32(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + dto = ls_locate(env, ls, fid); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + dt_declare_ref_add(env, dto, th); + dt_declare_ref_add(env, parent, th); + } + + rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(destroy, rc = -ENOTDIR); + /* Add "." and ".." for newly created dir */ + rc = dt_insert(env, dto, (void *)fid, (void *)".", th, + BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + dt_ref_add(env, dto, th); + rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu), + (void *)"..", th, BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + } + + dt_write_lock(env, parent, 0); + rc = dt_insert(env, parent, (const struct dt_rec *)fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + lu_object_put_nocache(env, &dto->do_lu); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. + */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + else if (rc != -ENOENT) + dto = ERR_PTR(rc); + else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 0) { + rc = dt_insert(env, parent, + (const struct dt_rec *)&dti->dti_fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + lu_object_put_nocache(env, &dto->do_lu); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); + return; +} + +/** + * Initialize local OID storage for required sequence. + * That may be needed for services that uses local files and requires + * dynamic OID allocation for them. + * + * Per each sequence we have an object with 'first_fid' identificator + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is used now by llog subsystem and MGS for NID tables + * + * Function gets first_fid to create counter object. + * All dynamic fids will be generated with the same sequence and incremented + * OIDs + * + * Returned local_oid_storage is in-memory representaion of OID storage + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + struct los_ondisk losd; + struct dt_object *root = NULL; + struct dt_object *o = NULL; + struct thandle *th; + int rc; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + GOTO(out_los, rc); + + root = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(root)) + GOTO(out_los, rc = PTR_ERR(root)); + + /* initialize data allowing to generate new fids, + * literally we need a sequence */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-%Lx-lastid", + fid_seq(first_fid)); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + if (rc == -ENOENT) + dti->dti_fid = *first_fid; + else if (rc < 0) + GOTO(out_los, rc); + + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + LASSERT(fid_seq(&dti->dti_fid) == fid_seq(first_fid)); + if (!dt_object_exists(o)) { + LASSERT(rc == -ENOENT); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_declare_insert(env, root, + (const struct dt_rec *)&dti->dti_fid, + (const struct dt_key *)dti->dti_buf, + th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_declare_record_write(env, o, sizeof(losd), 0, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, root, 0); + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + losd.lso_magic = cpu_to_le32(LOS_MAGIC); + losd.lso_next_oid = cpu_to_le32(fid_oid(first_fid) + 1); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); + rc = dt_insert(env, root, + (const struct dt_rec *)&dti->dti_fid, + (const struct dt_key *)dti->dti_buf, + th, BYPASS_CAPA, 1); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); + dt_write_unlock(env, root); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("local storage file "DFID" is corrupted\n", + PFID(first_fid)); + rc = -EINVAL; + } + } +out_los: + if (root != NULL && !IS_ERR(root)) + lu_object_put_nocache(env, &root->do_lu); + + if (rc != 0) { + list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + lu_object_put_nocache(env, &o->do_lu); + } else { + (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le32_to_cpu(losd.lso_next_oid); + (*los)->los_obj = o; + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + if (!atomic_dec_and_test(&los->los_refcount)) + return; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + mutex_lock(&ls->ls_los_mutex); + if (atomic_read(&los->los_refcount) == 0) { + if (los->los_obj) + lu_object_put_nocache(env, &los->los_obj->do_lu); + list_del(&los->los_list); + OBD_FREE_PTR(los); + } + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h new file mode 100644 index 000000000000..7c5c0bc855bd --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin + */ + +#include +#include +#include +#include + +struct ls_device { + struct dt_device ls_top_dev; + /* all initialized ls_devices on this node linked by this */ + struct list_head ls_linkage; + /* how many handle's reference this local storage */ + atomic_t ls_refcount; + /* underlaying OSD device */ + struct dt_device *ls_osd; + /* list of all local OID storages */ + struct list_head ls_los_list; + struct mutex ls_los_mutex; +}; + +static inline struct ls_device *dt2ls_dev(struct dt_device *d) +{ + return container_of0(d, struct ls_device, ls_top_dev); +} + +struct ls_object { + struct lu_object_header ls_header; + struct dt_object ls_obj; +}; + +static inline struct ls_object *lu2ls_obj(struct lu_object *o) +{ + return container_of0(o, struct ls_object, ls_obj.do_lu); +} + +static inline struct dt_object *ls_locate(const struct lu_env *env, + struct ls_device *ls, + const struct lu_fid *fid) +{ + return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev); +} + +struct ls_device *ls_device_get(struct dt_device *dev); +void ls_device_put(const struct lu_env *env, struct ls_device *ls); +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq); +void dt_los_put(struct local_oid_storage *los); diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 000000000000..7afc2ad4c8d5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,575 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include + +#if defined(LPROCFS) + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. + * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. + */ + +struct job_stat { + struct hlist_node js_hash; + struct list_head js_list; + atomic_t js_refcount; + char js_jobid[JOBSTATS_JOBID_SIZE]; + time_t js_timestamp; /* seconds */ + struct lprocfs_stats *js_stats; + struct obd_job_stats *js_jobstats; +}; + +static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + CERROR("Should not have any items!"); +} + +static cfs_hash_ops_t job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + time_t oldest = *((time_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (!oldest || job->js_timestamp < oldest) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force) +{ + time_t oldest, now; + + if (stats->ojs_cleanup_interval == 0) + return; + + now = cfs_time_current_sec(); + if (!force && now < stats->ojs_last_cleanup + + stats->ojs_cleanup_interval) + return; + + oldest = now - stats->ojs_cleanup_interval; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + stats->ojs_last_cleanup = cfs_time_current_sec(); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn); + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats); + + memcpy(job->js_jobid, jobid, JOBSTATS_JOBID_SIZE); + job->js_timestamp = cfs_time_current_sec(); + job->js_jobstats = jobs; + INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats && stats->ojs_hash); + + lprocfs_job_cleanup(stats, false); + + if (!jobid || !strlen(jobid)) + RETURN(-EINVAL); + + if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) { + CERROR("Invalid jobid size (%lu), expect(%d)\n", + (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + LASSERT(stats->ojs_cntr_num > event); + job->js_timestamp = cfs_time_current_sec(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + time_t oldest = 0; + + if (stats->ojs_hash == NULL) + return; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: test_id.222.25844 + * snapshot_time: 1322494486 + * open: { samples: 3, unit: reqs } + * close: { samples: 3, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 1, unit: reqs } + * getattr: { samples: 7, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id 4854 + * snapshot_time: 1322494602 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 10, max: 10, sum: 10 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter *cntr; + struct lprocfs_counter_header *cntr_header; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid); + seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr = lprocfs_stats_counter_get(s, 0, i); + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11"LPF64"u", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if (cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u," + " sum:%16"LPF64"u", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18"LPF64"u", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + return 0; +} + +struct seq_operations lprocfs_jobstats_seq_sops = { + start: lprocfs_jobstats_seq_start, + stop: lprocfs_jobstats_seq_stop, + next: lprocfs_jobstats_seq_next, + show: lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + int rc; + + if (LPROCFS_ENTRY_AND_CHECK(dp)) + return -ENOENT; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) { + LPROCFS_EXIT(); + return rc; + } + seq = file->private_data; + seq->private = dp->data; + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[JOBSTATS_JOBID_SIZE]; + int all = 0; + struct job_stat *job; + + if (!memcmp(buf, "clear", strlen("clear"))) { + all = 1; + } else if (len < JOBSTATS_JOBID_SIZE) { + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Trim '\n' if any */ + if (buf[len - 1] == '\n') + memcpy(jobid, buf, len - 1); + else + memcpy(jobid, buf, len); + } else { + return -EINVAL; + } + + LASSERT(stats->ojs_hash); + if (all) { + time_t oldest = 0; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + return len; + } + + if (!strlen(jobid)) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +struct file_operations lprocfs_jobstats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_jobstats_seq_open, + .read = seq_read, + .write = lprocfs_jobstats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) { + CERROR("Invalid obd device type.\n"); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + stats->ojs_cleanup_interval = 600; /* 10 mins by default */ + stats->ojs_last_cleanup = cfs_time_current_sec(); + + LPROCFS_WRITE_ENTRY(); + entry = create_proc_entry("job_stats", 0644, obd->obd_proc_entry); + LPROCFS_WRITE_EXIT(); + if (entry) { + entry->proc_fops = &lprocfs_jobstats_seq_fops; + entry->data = stats; + RETURN(0); + } else { + lprocfs_job_stats_fini(obd); + RETURN(-ENOMEM); + } +} +EXPORT_SYMBOL(lprocfs_job_stats_init); + +int lprocfs_rd_job_interval(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + *eof = 1; + return snprintf(page, count, "%d\n", stats->ojs_cleanup_interval); +} +EXPORT_SYMBOL(lprocfs_rd_job_interval); + +int lprocfs_wr_job_interval(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + int val, rc; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = val; + lprocfs_job_cleanup(stats, true); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_job_interval); + +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c new file mode 100644 index 000000000000..96e568f6757a --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,2599 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include + +#if defined(LPROCFS) + +static int lprocfs_no_percpu_stats = 0; +CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644, + "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +/* for bug 10866, global variable */ +DECLARE_RWSEM(_lprocfs_lock); +EXPORT_SYMBOL(_lprocfs_lock); + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + LPROCFS_EXIT(); + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + LPROCFS_EXIT(); + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +static struct proc_dir_entry *__lprocfs_srch(struct proc_dir_entry *head, + const char *name) +{ + struct proc_dir_entry *temp; + + if (head == NULL) + return NULL; + + temp = head->subdir; + while (temp != NULL) { + if (strcmp(temp->name, name) == 0) { + return temp; + } + + temp = temp->next; + } + return NULL; +} + +struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head, + const char *name) +{ + struct proc_dir_entry *temp; + + LPROCFS_SRCH_ENTRY(); + temp = __lprocfs_srch(head, name); + LPROCFS_SRCH_EXIT(); + return temp; +} +EXPORT_SYMBOL(lprocfs_srch); + +/* lprocfs API calls */ + +/* Function that emulates snprintf but also has the side effect of advancing + the page pointer for the next write into the buffer, incrementing the total + length written to the buffer, and decrementing the size left in the + buffer. */ +static int lprocfs_obd_snprintf(char **page, int end, int *len, + const char *format, ...) +{ + va_list list; + int n; + + if (*len >= end) + return 0; + + va_start(list, format); + n = vsnprintf(*page, end - *len, format, list); + va_end(list); + + *page += n; *len += n; + return n; +} + +proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + read_proc_t *read_proc, + write_proc_t *write_proc, + void *data, + struct file_operations *fops) +{ + proc_dir_entry_t *proc; + mode_t mode = 0; + + if (root == NULL || name == NULL) + return ERR_PTR(-EINVAL); + if (read_proc) + mode = 0444; + if (write_proc) + mode |= 0200; + if (fops) + mode = 0644; + LPROCFS_WRITE_ENTRY(); + proc = create_proc_entry(name, mode, root); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s", name); + LPROCFS_WRITE_EXIT(); + return ERR_PTR(-ENOMEM); + } + proc->read_proc = read_proc; + proc->write_proc = write_proc; + proc->data = data; + if (fops) + proc->proc_fops = fops; + LPROCFS_WRITE_EXIT(); + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) +{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from %s to %s", + name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static ssize_t lprocfs_fops_read(struct file *f, char __user *buf, + size_t size, loff_t *ppos) +{ + struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode); + char *page, *start = NULL; + int rc = 0, eof = 1, count; + + if (*ppos >= PAGE_CACHE_SIZE) + return 0; + + page = (char *)__get_free_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + if (LPROCFS_ENTRY_AND_CHECK(dp)) { + rc = -ENOENT; + goto out; + } + + OBD_FAIL_TIMEOUT(OBD_FAIL_LPROC_REMOVE, 10); + if (dp->read_proc) + rc = dp->read_proc(page, &start, *ppos, PAGE_CACHE_SIZE, + &eof, dp->data); + LPROCFS_EXIT(); + if (rc <= 0) + goto out; + + /* for lustre proc read, the read count must be less than PAGE_SIZE */ + LASSERT(eof == 1); + + if (start == NULL) { + rc -= *ppos; + if (rc < 0) + rc = 0; + if (rc == 0) + goto out; + start = page + *ppos; + } else if (start < page) { + start = page; + } + + count = (rc < size) ? rc : size; + if (copy_to_user(buf, start, count)) { + rc = -EFAULT; + goto out; + } + *ppos += count; + +out: + free_page((unsigned long)page); + return rc; +} + +static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf, + size_t size, loff_t *ppos) +{ + struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode); + int rc = -EIO; + + if (LPROCFS_ENTRY_AND_CHECK(dp)) + return -ENOENT; + if (dp->write_proc) + rc = dp->write_proc(f, buf, size, dp->data); + LPROCFS_EXIT(); + return rc; +} + +static struct file_operations lprocfs_generic_fops = { + .owner = THIS_MODULE, + .read = lprocfs_fops_read, + .write = lprocfs_fops_write, +}; + +int lprocfs_evict_client_open(struct inode *inode, struct file *f) +{ + struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode); + struct obd_device *obd = dp->data; + + atomic_inc(&obd->obd_evict_inprogress); + + return 0; +} + +int lprocfs_evict_client_release(struct inode *inode, struct file *f) +{ + struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode); + struct obd_device *obd = dp->data; + + atomic_dec(&obd->obd_evict_inprogress); + wake_up(&obd->obd_evict_inprogress_waitq); + + return 0; +} + +struct file_operations lprocfs_evict_client_fops = { + .owner = THIS_MODULE, + .read = lprocfs_fops_read, + .write = lprocfs_fops_write, + .open = lprocfs_evict_client_open, + .release = lprocfs_evict_client_release, +}; +EXPORT_SYMBOL(lprocfs_evict_client_fops); + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. + * + * \retval 0 on success + * < 0 on error + */ +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + int rc = 0; + + if (root == NULL || list == NULL) + return -EINVAL; + + LPROCFS_WRITE_ENTRY(); + while (list->name != NULL) { + struct proc_dir_entry *cur_root, *proc; + char *pathcopy, *cur, *next, pathbuf[64]; + int pathsize = strlen(list->name) + 1; + + proc = NULL; + cur_root = root; + + /* need copy of path for strsep */ + if (strlen(list->name) > sizeof(pathbuf) - 1) { + OBD_ALLOC(pathcopy, pathsize); + if (pathcopy == NULL) + GOTO(out, rc = -ENOMEM); + } else { + pathcopy = pathbuf; + } + + next = pathcopy; + strcpy(pathcopy, list->name); + + while (cur_root != NULL && (cur = strsep(&next, "/"))) { + if (*cur =='\0') /* skip double/trailing "/" */ + continue; + + proc = __lprocfs_srch(cur_root, cur); + CDEBUG(D_OTHER, "cur_root=%s, cur=%s, next=%s, (%s)\n", + cur_root->name, cur, next, + (proc ? "exists" : "new")); + if (next != NULL) { + cur_root = (proc ? proc : + proc_mkdir(cur, cur_root)); + } else if (proc == NULL) { + mode_t mode = 0; + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else { + if (list->read_fptr) + mode = 0444; + if (list->write_fptr) + mode |= 0200; + } + proc = create_proc_entry(cur, mode, cur_root); + } + } + + if (pathcopy != pathbuf) + OBD_FREE(pathcopy, pathsize); + + if (cur_root == NULL || proc == NULL) { + CERROR("LprocFS: No memory to create /proc entry %s", + list->name); + GOTO(out, rc = -ENOMEM); + } + + if (list->fops) + proc->proc_fops = list->fops; + else + proc->proc_fops = &lprocfs_generic_fops; + proc->read_proc = list->read_fptr; + proc->write_proc = list->write_fptr; + proc->data = (list->data ? list->data : data); + list++; + } +out: + LPROCFS_WRITE_EXIT(); + return rc; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove_nolock(struct proc_dir_entry **rooth) +{ + struct proc_dir_entry *root = *rooth; + struct proc_dir_entry *temp = root; + struct proc_dir_entry *rm_entry; + struct proc_dir_entry *parent; + + if (!root) + return; + *rooth = NULL; + + parent = root->parent; + LASSERT(parent != NULL); + + while (1) { + while (temp->subdir != NULL) + temp = temp->subdir; + + rm_entry = temp; + temp = temp->parent; + + /* Memory corruption once caused this to fail, and + without this LASSERT we would loop here forever. */ + LASSERTF(strlen(rm_entry->name) == rm_entry->namelen, + "0x%p %s/%s len %d\n", rm_entry, temp->name, + rm_entry->name, (int)strlen(rm_entry->name)); + + remove_proc_entry(rm_entry->name, temp); + if (temp == parent) + break; + } +} + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + LPROCFS_WRITE_ENTRY(); /* search vs remove race */ + lprocfs_remove_nolock(rooth); + LPROCFS_WRITE_EXIT(); +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +void lprocfs_try_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *t = NULL; + struct proc_dir_entry **p; + int len, busy = 0; + + LASSERT(parent != NULL); + len = strlen(name); + + LPROCFS_WRITE_ENTRY(); + + /* lookup target name */ + for (p = &parent->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != len) + continue; + if (memcmp(name, (*p)->name, len)) + continue; + t = *p; + break; + } + + if (t) { + /* verify it's empty: do not count "num_refs" */ + for (p = &t->subdir; *p; p = &(*p)->next) { + if ((*p)->namelen != strlen("num_refs")) { + busy = 1; + break; + } + if (memcmp("num_refs", (*p)->name, + strlen("num_refs"))) { + busy = 1; + break; + } + } + } + + if (busy == 0) + lprocfs_remove_nolock(&t); + + LPROCFS_WRITE_EXIT(); + + return; +} +EXPORT_SYMBOL(lprocfs_try_remove_proc_entry); + +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) +{ + struct proc_dir_entry *newchild; + + newchild = lprocfs_srch(parent, name); + if (newchild != NULL) { + CERROR(" Lproc: Attempting to register %s more than once \n", + name); + return ERR_PTR(-EALREADY); + } + + newchild = proc_mkdir(name, parent); + if (newchild != NULL && list != NULL) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_rd_uint(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + unsigned int *temp = data; + return snprintf(page, count, "%u\n", *temp); +} +EXPORT_SYMBOL(lprocfs_rd_uint); + +int lprocfs_wr_uint(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + LASSERT(data != NULL); + *eof = 1; + return snprintf(page, count, LPU64"\n", *(__u64 *)data); +} +EXPORT_SYMBOL(lprocfs_rd_u64); + +int lprocfs_rd_atomic(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + *eof = 1; + return snprintf(page, count, "%d\n", atomic_read(atom)); +} +EXPORT_SYMBOL(lprocfs_rd_atomic); + +int lprocfs_wr_atomic(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + atomic_t *atm = data; + int val = 0; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_atomic); + +int lprocfs_rd_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + *eof = 1; + return snprintf(page, count, "%s\n", obd->obd_uuid.uuid); +} +EXPORT_SYMBOL(lprocfs_rd_uuid); + +int lprocfs_rd_name(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + *eof = 1; + return snprintf(page, count, "%s\n", dev->obd_name); +} +EXPORT_SYMBOL(lprocfs_rd_name); + +int lprocfs_rd_blksize(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, "%u\n", osfs.os_bsize); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_blksize); + +int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); + +int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); + +int lprocfs_rd_kbytesavail(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesavail); + +int lprocfs_rd_filestotal(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filestotal); + +int lprocfs_rd_filesfree(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_ffree); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filesfree); + +int lprocfs_rd_server_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + *eof = 1; + rc = snprintf(page, count, "%s\t%s%s\n", + obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? "\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_server_uuid); + +int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + int rc = 0; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + *eof = 1; + if (conn && obd->u.cli.cl_import) { + rc = snprintf(page, count, "%s\n", + conn->c_remote_uuid.uuid); + } else { + rc = snprintf(page, count, "%s\n", ""); + } + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *cntr_header; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + cntr_header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_stats_collect); + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag) \ + if (imp->imp_##flag && max - len > 0) \ + len += snprintf(str + len, max - len, "%s" #flag, len ? ", " : ""); +static int obd_import_flags2str(struct obd_import *imp, char *str, int max) +{ + int len = 0; + + if (imp->imp_obd->obd_no_recov) + len += snprintf(str, max - len, "no_recov"); + + flag2str(invalid); + flag2str(deactive); + flag2str(replayable); + flag2str(pingable); + return len; +} +#undef flags2str + +static const char *obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file(obsolete)", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + "layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "unknown", + NULL +}; + +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i, ret = 0; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown flags "LPX64, + ret ? sep : "", flags & ~(mask - 1)); + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +int lprocfs_rd_import(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + int i; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + *eof = 1; + + i = snprintf(page, count, + "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " instance: %u\n" + " connect_flags: [", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state), + imp->imp_connect_data.ocd_instance); + i += obd_connect_flags2str(page + i, count - i, + imp->imp_connect_data.ocd_connect_flags, + ", "); + i += snprintf(page + i, count - i, + "]\n" + " import_flags: ["); + i += obd_import_flags2str(imp, page + i, count - i); + + i += snprintf(page + i, count - i, + "]\n" + " connection:\n" + " failover_nids: ["); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + i += snprintf(page + i, count - i, "%s%s", j ? ", " : "", + libcfs_nid2str(conn->oic_conn->c_peer.nid)); + j++; + } + i += snprintf(page + i, count - i, + "]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + imp->imp_connection == NULL ? "" : + libcfs_nid2str(imp->imp_connection->c_peer.nid), + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + i += snprintf(page + i, count - i, + " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: "LPU64" %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + i += snprintf(page + i, count - i, + " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + i += snprintf(page + i, count - i, + " transactions:\n" + " last_replay: "LPU64"\n" + " peer_committed: "LPU64"\n" + " last_checked: "LPU64"\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + i += snprintf(page + i, count - i, + " %s_data_averages:\n" + " bytes_per_rpc: "LPU64"\n", + rw ? "write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + i += snprintf(page + i, count - i, + " %s_per_rpc: "LPU64"\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + i += snprintf(page + i, count - i, + " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return i; +} +EXPORT_SYMBOL(lprocfs_rd_import); + +int lprocfs_rd_state(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int i, j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + *eof = 1; + + i = snprintf(page, count, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + i += snprintf(page + i, count - i, + "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + i += snprintf(page + i, count - i, " - ["CFS_TIME_T", %s]\n", + ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return i; +} +EXPORT_SYMBOL(lprocfs_rd_state); + +int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]); + rc += snprintf(page + rc, count - rc, "\n"); + return rc; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i, rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + *eof = 1; + + now = cfs_time_current_sec(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + rc += snprintf(page + rc, count - rc, + "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + rc += snprintf(page + rc, count - rc, + "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + rc += snprintf(page + rc, count - rc, + "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_timeouts); + +int lprocfs_rd_connect_flags(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + int ret = 0; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + ret = snprintf(page, count, "flags="LPX64"\n", flags); + ret += obd_connect_flags2str(page + ret, count - ret, flags, "\n"); + ret += snprintf(page + ret, count - ret, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return ret; +} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + +int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + *eof = 1; + return snprintf(page, count, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(lprocfs_rd_num_exports); + +int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_type *class = (struct obd_type*) data; + + LASSERT(class != NULL); + *eof = 1; + return snprintf(page, count, "%d\n", class->typ_refcnt); +} +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + list, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + LASSERT(obd->obd_proc_exports_entry->subdir == NULL); + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; + +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + cfs_hash_t *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + header = &stats->ls_cnt_header[j]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + /* return 1st cpu location */ + return (*pos >= stats->ls_num) ? NULL : + lprocfs_stats_counter_get(stats, 0, *pos); +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + ++*pos; + return (*pos >= stats->ls_num) ? NULL : + lprocfs_stats_counter_get(stats, 0, *pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter *cntr = v; + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + int entry_size; + int idx; + int rc = 0; + + if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) { + struct timeval now; + do_gettimeofday(&now); + rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", now.tv_sec, now.tv_usec); + if (rc < 0) + return rc; + } + entry_size = sizeof(*cntr); + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + entry_size += sizeof(__s64); + idx = ((void *)cntr - (void *)&(stats->ls_percpu[0])->lp_cntr[0]) / + entry_size; + + header = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ret); + + if (ret.lc_count == 0) + goto out; + + rc = seq_printf(p, "%-25s "LPD64" samples [%s]", header->lc_name, + ret.lc_count, header->lc_units); + + if (rc < 0) + goto out; + + if ((header->lc_config & LPROCFS_CNTR_AVGMINMAX) && + (ret.lc_count > 0)) { + rc = seq_printf(p, " "LPD64" "LPD64" "LPD64, + ret.lc_min, ret.lc_max, ret.lc_sum); + if (rc < 0) + goto out; + if (header->lc_config & LPROCFS_CNTR_STDDEV) + rc = seq_printf(p, " "LPD64, ret.lc_sumsquare); + if (rc < 0) + goto out; + } + rc = seq_printf(p, "\n"); + out: + return (rc < 0) ? rc : 0; +} + +struct seq_operations lprocfs_stats_seq_sops = { + start: lprocfs_stats_seq_start, + stop: lprocfs_stats_seq_stop, + next: lprocfs_stats_seq_next, + show: lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seq; + int rc; + + if (LPROCFS_ENTRY_AND_CHECK(dp)) + return -ENOENT; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) { + LPROCFS_EXIT(); + return rc; + } + seq = file->private_data; + seq->private = dp->data; + return 0; +} + +struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + LPROCFS_WRITE_ENTRY(); + entry = create_proc_entry(name, 0644, root); + if (entry) { + entry->proc_fops = &lprocfs_stats_seq_fops; + entry->data = stats; + } + + LPROCFS_WRITE_EXIT(); + + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) + + num_private_stats - 1 /* o_owner */; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * , and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op " + "operation at offset %d.\n", i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +#define LPROCFS_MD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} +EXPORT_SYMBOL(lprocfs_init_mps_stats); + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->md_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->md_cntr_base == 0); + + num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) + + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op " + "operation at offset %d. Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->md_stats = stats; + obd->md_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->md_stats; + + if (stats != NULL) { + obd->md_stats = NULL; + obd->md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +int lprocfs_exp_rd_nid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_export *exp = data; + LASSERT(exp != NULL); + *eof = 1; + return snprintf(page, count, "%s\n", obd_export_nid2str(exp)); +} + +struct exp_uuid_cb_data { + char *page; + int count; + int *eof; + int *len; +}; + +static void +lprocfs_exp_rd_cb_data_init(struct exp_uuid_cb_data *cb_data, char *page, + int count, int *eof, int *len) +{ + cb_data->page = page; + cb_data->count = count; + cb_data->eof = eof; + cb_data->len = len; +} + +int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct exp_uuid_cb_data *data = (struct exp_uuid_cb_data *)cb_data; + + if (exp->exp_nid_stats) + *data->len += snprintf((data->page + *data->len), + data->count, "%s\n", + obd_uuid2str(&exp->exp_client_uuid)); + return 0; +} + +int lprocfs_exp_rd_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct nid_stat *stats = (struct nid_stat *)data; + struct exp_uuid_cb_data cb_data; + struct obd_device *obd = stats->nid_obd; + int len = 0; + + *eof = 1; + page[0] = '\0'; + lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len); + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid, &cb_data); + return (*cb_data.len); +} + +int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct exp_uuid_cb_data *data = cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + if (!*data->len) { + *data->len += cfs_hash_debug_header(data->page, + data->count); + } + *data->len += cfs_hash_debug_str(hs, data->page + *data->len, + data->count); + } + + return 0; +} + +int lprocfs_exp_rd_hash(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct nid_stat *stats = (struct nid_stat *)data; + struct exp_uuid_cb_data cb_data; + struct obd_device *obd = stats->nid_obd; + int len = 0; + + *eof = 1; + page[0] = '\0'; + lprocfs_exp_rd_cb_data_init(&cb_data, page, count, eof, &len); + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash, &cb_data); + return (*cb_data.len); +} + +int lprocfs_nid_stats_clear_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, "%s\n", + "Write into this file to clear all nid stats and " + "stale nid entries"); +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. */ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + proc_dir_entry_t *entry; + char *buffer = NULL; + int rc = 0; + ENTRY; + + *newnid = 0; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. + * Anything else is nonsense.*/ + if (!nid || *nid == LNET_NID_ANY) + RETURN(0); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, libcfs_nid2str(*nid), + atomic_read(&new_stat->nid_exp_ref_count)); + + /* We need to release old stats because lprocfs_exp_cleanup() hasn't + * been and will never be called. */ + if (exp->exp_nid_stats) { + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + } + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + exp->exp_nid_stats = old_stat; + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + OBD_ALLOC(buffer, LNET_NIDSTR_SIZE); + if (buffer == NULL) + GOTO(destroy_new, rc = -ENOMEM); + + memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE); + new_stat->nid_proc = lprocfs_register(buffer, + obd->obd_proc_exports_entry, + NULL, NULL); + OBD_FREE(buffer, LNET_NIDSTR_SIZE); + + if (new_stat->nid_proc == NULL) { + CERROR("Error making export directory for nid %s\n", + libcfs_nid2str(*nid)); + GOTO(destroy_new_ns, rc = -ENOMEM); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", + lprocfs_exp_rd_uuid, NULL, new_stat, NULL); + if (IS_ERR(entry)) { + CWARN("Error adding the NID stats file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", + lprocfs_exp_rd_hash, NULL, new_stat, NULL); + if (IS_ERR(entry)) { + CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + exp->exp_nid_stats = new_stat; + *newnid = 1; + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(rc); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if(!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} +EXPORT_SYMBOL(lprocfs_exp_cleanup); + +int lprocfs_write_helper(const char *buffer, unsigned long count, + int *val) +{ + return lprocfs_write_frac_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_helper); + +int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_helper); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { + /* only reserved 2 bits fraction */ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. + */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val) +{ + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_u64_helper); + +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + __u64 whole, frac = 0, units; + unsigned frac_d = 1; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + whole = simple_strtoull(pbuf, &end, 10); + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int i; + pbuf = end + 1; + + /* need to limit frac_d to a __u32 */ + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + frac = simple_strtoull(pbuf, &end, 10); + /* count decimal places */ + for (i = 0; i < (end - pbuf); i++) + frac_d *= 10; + } + + units = 1; + switch(*end) { + case 'p': case 'P': + units <<= 10; + case 't': case 'T': + units <<= 10; + case 'g': case 'G': + units <<= 10; + case 'm': case 'M': + units <<= 10; + case 'k': case 'K': + units <<= 10; + } + /* Specified units override the multiplier */ + if (units) + mult = mult < 0 ? -units : units; + + frac *= mult; + do_div(frac, frac_d); + *val = whole * mult + frac; + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); + +static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. + */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + unsigned long *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(proc_dir_entry_t *parent, + const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + + LPROCFS_WRITE_ENTRY(); + entry = create_proc_entry(name, mode, parent); + if (entry) { + entry->proc_fops = seq_fops; + entry->data = data; + } + LPROCFS_WRITE_EXIT(); + + if (entry == NULL) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + mode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return (lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data)); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val; + + for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++) + ; + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} +EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +int lprocfs_obd_rd_hash(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + int c = 0; + + if (obd == NULL) + return 0; + + c += cfs_hash_debug_header(page, count); + c += cfs_hash_debug_str(obd->obd_uuid_hash, page + c, count - c); + c += cfs_hash_debug_str(obd->obd_nid_hash, page + c, count - c); + c += cfs_hash_debug_str(obd->obd_nid_stats_hash, page+c, count-c); + + return c; +} +EXPORT_SYMBOL(lprocfs_obd_rd_hash); + +int lprocfs_obd_rd_recovery_status(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + int len = 0, size; + + LASSERT(obd != NULL); + LASSERT(count >= 0); + + /* Set start of user data returned to + page + off since the user may have + requested to read much smaller than + what we need to read */ + *start = page + off; + + /* We know we are allocated a page here. + Also we know that this function will + not need to write more than a page + so we can truncate at PAGE_CACHE_SIZE. */ + size = min(count + (int)off + 1, (int)PAGE_CACHE_SIZE); + + /* Initialize the page */ + memset(page, 0, size); + + if (lprocfs_obd_snprintf(&page, size, &len, "status: ") <= 0) + goto out; + if (obd->obd_max_recoverable_clients == 0) { + if (lprocfs_obd_snprintf(&page, size, &len, "INACTIVE\n") <= 0) + goto out; + + goto fclose; + } + + /* sampled unlocked, but really... */ + if (obd->obd_recovering == 0) { + if (lprocfs_obd_snprintf(&page, size, &len, "COMPLETE\n") <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, + "recovery_start: %lu\n", + obd->obd_recovery_start) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, + "recovery_duration: %lu\n", + obd->obd_recovery_end - + obd->obd_recovery_start) <= 0) + goto out; + /* Number of clients that have completed recovery */ + if (lprocfs_obd_snprintf(&page, size, &len, + "completed_clients: %d/%d\n", + obd->obd_max_recoverable_clients - + obd->obd_stale_clients, + obd->obd_max_recoverable_clients) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, + "replayed_requests: %d\n", + obd->obd_replayed_requests) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, + "last_transno: "LPD64"\n", + obd->obd_next_recovery_transno - 1)<=0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, "VBR: %s\n", + obd->obd_version_recov ? + "ENABLED" : "DISABLED") <=0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, "IR: %s\n", + obd->obd_no_ir ? + "DISABLED" : "ENABLED") <= 0) + goto out; + goto fclose; + } + + if (lprocfs_obd_snprintf(&page, size, &len, "RECOVERING\n") <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, "recovery_start: %lu\n", + obd->obd_recovery_start) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, "time_remaining: %lu\n", + cfs_time_current_sec() >= + obd->obd_recovery_start + + obd->obd_recovery_timeout ? 0 : + obd->obd_recovery_start + + obd->obd_recovery_timeout - + cfs_time_current_sec()) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len,"connected_clients: %d/%d\n", + atomic_read(&obd->obd_connected_clients), + obd->obd_max_recoverable_clients) <= 0) + goto out; + /* Number of clients that have completed recovery */ + if (lprocfs_obd_snprintf(&page, size, &len,"req_replay_clients: %d\n", + atomic_read(&obd->obd_req_replay_clients)) + <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len,"lock_repay_clients: %d\n", + atomic_read(&obd->obd_lock_replay_clients)) + <=0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len,"completed_clients: %d\n", + atomic_read(&obd->obd_connected_clients) - + atomic_read(&obd->obd_lock_replay_clients)) + <=0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len,"evicted_clients: %d\n", + obd->obd_stale_clients) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len,"replayed_requests: %d\n", + obd->obd_replayed_requests) <= 0) + goto out; + if (lprocfs_obd_snprintf(&page, size, &len, "queued_requests: %d\n", + obd->obd_requests_queued_for_recovery) <= 0) + goto out; + + if (lprocfs_obd_snprintf(&page, size, &len, "next_transno: "LPD64"\n", + obd->obd_next_recovery_transno) <= 0) + goto out; + +fclose: + *eof = 1; +out: + return min(count, len - (int)off); +} +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_status); + +int lprocfs_obd_rd_ir_factor(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->obd_recovery_ir_factor); +} +EXPORT_SYMBOL(lprocfs_obd_rd_ir_factor); + +int lprocfs_obd_wr_ir_factor(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < OBD_IR_FACTOR_MIN || val > OBD_IR_FACTOR_MAX) + return -EINVAL; + + obd->obd_recovery_ir_factor = val; + return count; +} +EXPORT_SYMBOL(lprocfs_obd_wr_ir_factor); + +int lprocfs_obd_rd_recovery_time_soft(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->obd_recovery_timeout); +} +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_soft); + +int lprocfs_obd_wr_recovery_time_soft(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->obd_recovery_timeout = val; + return count; +} +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_soft); + +int lprocfs_obd_rd_recovery_time_hard(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%u\n", obd->obd_recovery_time_hard); +} +EXPORT_SYMBOL(lprocfs_obd_rd_recovery_time_hard); + +int lprocfs_obd_wr_recovery_time_hard(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->obd_recovery_time_hard = val; + return count; +} +EXPORT_SYMBOL(lprocfs_obd_wr_recovery_time_hard); + +int lprocfs_obd_rd_max_pages_per_rpc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%d\n", cli->cl_max_pages_per_rpc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc); + +int lprocfs_target_rd_instance(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_device_target *target = &obd->u.obt; + + LASSERT(obd != NULL); + LASSERT(target->obt_magic == OBT_MAGIC); + *eof = 1; + return snprintf(page, count, "%u\n", obd->u.obt.obt_instance); +} +EXPORT_SYMBOL(lprocfs_target_rd_instance); +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c new file mode 100644 index 000000000000..6c0de3f0a073 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -0,0 +1,2209 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include + +# include + +/* hash_long() */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + cfs_hash_bd_t bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. + */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + LASSERT(bkt->lsb_busy > 0); + bkt->lsb_busy--; + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top)) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached), removed it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. + */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash; + cfs_hash_bd_t bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + list_del_init(&top->loh_lru); + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + int clean; + int result; + ENTRY; + + /* + * Create top-level object slice. This will also create + * lu_object_header. + */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + RETURN(ERR_PTR(-ENOMEM)); + if (IS_ERR(top)) + RETURN(top); + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (scan->lo_flags & LU_OBJECT_ALLOCATED) + continue; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + scan->lo_flags |= LU_OBJECT_ALLOCATED; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + RETURN(top); +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. + */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + */ +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + cfs_hash_bd_t bd2; + struct list_head dispose; + int did_sth; + int start; + int count; + int bnr; + int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + start = s->ls_purge_start; + bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + + return nr; +} +EXPORT_SYMBOL(lu_site_purge); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. + * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. + */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist":""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{ \n"); + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + depth = o->lo_depth + 4; + + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + if (o->lo_ops->loo_object_print != NULL) + o->lo_ops->loo_object_print(env, cookie, printer, o); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} +EXPORT_SYMBOL(lu_object_invariant); + +static struct lu_object *htable_lookup(struct lu_site *s, + cfs_hash_bd_t *bd, + const struct lu_fid *f, + wait_queue_t *waiter, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return NULL; + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (hnode == NULL) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return NULL; + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + if (likely(!lu_object_is_dying(h))) { + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + list_del_init(&h->loh_lru); + return lu_object_top(h); + } + + /* + * Lookup found an object being destroyed this object cannot be + * returned (to assure that references to dying objects are eventually + * drained), and moreover, lookup has to wait until object is freed. + */ + + init_waitqueue_entry_current(waiter); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + return ERR_PTR(-EAGAIN); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. + */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + struct lu_site_bkt_data *bkt; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; +} + +/** + * Core logic of lu_object_find*() functions. + */ +static struct lu_object *lu_object_find_try(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf, + wait_queue_t *waiter) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). + */ + if (conf != NULL && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, waiter, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (o != NULL) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. + */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, waiter, &version); + if (likely(shadow == NULL)) { + struct lu_site_bkt_data *bkt; + + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} + +/** + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_site_bkt_data *bkt; + struct lu_object *obj; + wait_queue_t wait; + + while (1) { + obj = lu_object_find_try(env, dev, f, conf, &wait); + if (obj != ERR_PTR(-EAGAIN)) + return obj; + /* + * lu_object_find_try() already added waiter into the + * wait queue. + */ + waitq_wait(&wait, TASK_UNINTERRUPTIBLE); + bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f); + remove_wait_queue(&bkt->lsb_marche_funebre, &wait); + } +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. + */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (!IS_ERR(top)) { + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (obj == NULL) + lu_object_put(env, top); + } else + obj = top; + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +/** + * Global list of all device types. + */ +static LIST_HEAD(lu_device_types); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + INIT_LIST_HEAD(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + if (result == 0) + list_add(&ldt->ldt_linkage, &lu_device_types); + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + list_del_init(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +void lu_types_stop(void) +{ + struct lu_device_type *ldt; + + list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { + if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) + ldt->ldt_ops->ldto_stop(ldt); + } +} +EXPORT_SYMBOL(lu_types_stop); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DEFINE_MUTEX(lu_sites_guard); + +/** + * Global environment used by site shrinker. + */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644, + "Percentage of memory to be used as lu_object cache"); + +/** + * Return desired hash table order. + */ +static int lu_htable_order(void) +{ + unsigned long cache_size; + int bits; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = num_physpages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT)) + cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. */ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in" + " the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_CACHE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + return bits; +} + +static unsigned lu_obj_hop_hash(cfs_hash_t *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = cfs_hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (atomic_add_return(1, &h->loh_ref) == 1) { + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + + cfs_hash_bd_get(hs, &h->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + bkt->lsb_busy++; + } +} + +static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +cfs_hash_ops_t lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + char name[16]; + int bits; + int i; + ENTRY; + + memset(s, 0, sizeof *s); + bits = lu_htable_order(); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + mutex_lock(&lu_sites_guard); + list_del_init(&s->ls_linkage); + mutex_unlock(&lu_sites_guard); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. + */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + mutex_lock(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + mutex_unlock(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof *d); + atomic_set(&d->ld_ref, 0); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t; + + t = d->ld_type; + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(t->ldt_device_nr > 0); + if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. + */ +int lu_object_init(struct lu_object *o, + struct lu_object_header *h, struct lu_device *d) +{ + memset(o, 0, sizeof *o); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, + o->lo_dev_ref , "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. + */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + + + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} +EXPORT_SYMBOL(lu_stack_fini); + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +static DEFINE_SPINLOCK(lu_keys_guard); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version = 0; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + spin_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + spin_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { + LINVRNT(module_refcount(key->lct_owner) > 0); + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. + */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + ++key_set_version; + spin_lock(&lu_keys_guard); + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + spin_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + key->lct_tags |= LCT_QUIESCENT; + /* + * XXX memory barrier has to go here. + */ + spin_lock(&lu_keys_guard); + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + spin_unlock(&lu_keys_guard); + ++key_set_version; + } +} +EXPORT_SYMBOL(lu_context_key_quiesce); + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; +} +EXPORT_SYMBOL(lu_context_key_revive); + +static void keys_fini(struct lu_context *ctx) +{ + int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. + */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) + return PTR_ERR(value); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF)) + try_module_get(key->lct_owner); + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + ctx->lc_version = key_set_version; + } + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + spin_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. + */ +int lu_context_refill(struct lu_context *ctx) +{ + return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); +} +EXPORT_SYMBOL(lu_context_refill); + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. + */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + +static struct shrinker *lu_site_shrinker = NULL; + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(cfs_hash_t *hs, + lu_site_stats_t *stats, int populated) +{ + cfs_hash_bd_t bd; + int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += bkt->lsb_busy; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + lu_site_stats_t stats; + struct lu_site *s; + struct lu_site *tmp; + int cached = 0; + int remain = shrink_param(sc, nr_to_scan); + LIST_HEAD(splice); + + if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) { + if (remain != 0) + return -1; + else + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return 0; + } + + CDEBUG(D_INODE, "Shrink %d objects\n", remain); + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + if (shrink_param(sc, nr_to_scan) != 0) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. + */ + list_move_tail(&s->ls_linkage, &splice); + } + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 0); + cached += stats.lss_total - stats.lss_busy; + if (shrink_param(sc, nr_to_scan) && remain <= 0) + break; + } + list_splice(&splice, lu_sites.prev); + mutex_unlock(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + if (shrink_param(sc, nr_to_scan) == 0) + CDEBUG(D_INODE, "%d objects cached\n", cached); + return cached; +} + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +int lu_debugging_setup(void) +{ + return lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("[%d]: %p %x (%p,%p,%p) %d %d \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} +EXPORT_SYMBOL(lu_context_keys_dump); + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. + */ + mutex_lock(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + mutex_unlock(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink); + if (lu_site_shrinker == NULL) + return -ENOMEM; + + return result; +} + +/** + * Dual to lu_global_init(). + */ +void lu_global_fini(void) +{ + if (lu_site_shrinker != NULL) { + remove_shrinker(lu_site_shrinker); + lu_site_shrinker = NULL; + } + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + mutex_lock(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + mutex_unlock(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef LPROCFS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, char *page, int count) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + return snprintf(page, count, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); +} +EXPORT_SYMBOL(lu_site_stats_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. + */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). + */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct lu_site_bkt_data *bkt; + struct lu_object *shadow; + wait_queue_t waiter; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + LASSERT(fid_is_zero(old)); + + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); + shadow = htable_lookup(s, &bd, fid, &waiter, &version); + /* supposed to be unique */ + LASSERT(shadow == NULL); + *old = *fid; + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. + * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, int len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c new file mode 100644 index 000000000000..23a76f158356 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include + +#include +#include +#include +#include diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c new file mode 100644 index 000000000000..229db6c39b78 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. + */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c new file mode 100644 index 000000000000..69d6499ef731 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -0,0 +1,263 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static spinlock_t handle_base_lock; + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + ENTRY; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. + */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle ("LPX64")\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + ENTRY; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + EXIT; +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + ENTRY; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. - jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(cfs_rcu_head_t *rcu) +{ + struct portals_handle *h = RCU2HANDLE(rcu); + void *ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timeval tv; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + spin_lock_init(&handle_base_lock); + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle "LPX64" addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c new file mode 100644 index 000000000000..2fa2589dc8eb --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include + +#define NIDS_MAX 32 + +struct uuid_nid_data { + struct list_head un_list; + struct obd_uuid un_uuid; + int un_nid_count; + lnet_nid_t un_nids[NIDS_MAX]; +}; + +/* FIXME: This should probably become more elegant than a global linked list */ +static struct list_head g_uuid_list; +static spinlock_t g_uuid_lock; + +void class_init_uuidlist(void) +{ + INIT_LIST_HEAD(&g_uuid_list); + spin_lock_init(&g_uuid_lock); +} + +void class_exit_uuidlist(void) +{ + /* delete all */ + class_del_uuid(NULL); +} + +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) +{ + struct uuid_nid_data *data; + struct obd_uuid tmp; + int rc = -ENOENT; + + obd_str2uuid(&tmp, uuid); + spin_lock(&g_uuid_lock); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + if (index >= data->un_nid_count) + break; + + rc = 0; + *peer_nid = data->un_nids[index]; + break; + } + } + spin_unlock(&g_uuid_lock); + return rc; +} +EXPORT_SYMBOL(lustre_uuid_to_peer); + +/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; + LNET will choose the best one. */ +int class_add_uuid(const char *uuid, __u64 nid) +{ + struct uuid_nid_data *data, *entry; + int found = 0; + + LASSERT(nid != 0); /* valid newconfig NID is never zero */ + + if (strlen(uuid) > UUID_MAX - 1) + return -EOVERFLOW; + + OBD_ALLOC_PTR(data); + if (data == NULL) + return -ENOMEM; + + obd_str2uuid(&data->un_uuid, uuid); + data->un_nids[0] = nid; + data->un_nid_count = 1; + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { + int i; + + found = 1; + for (i = 0; i < entry->un_nid_count; i++) + if (nid == entry->un_nids[i]) + break; + + if (i == entry->un_nid_count) { + LASSERT(entry->un_nid_count < NIDS_MAX); + entry->un_nids[entry->un_nid_count++] = nid; + } + break; + } + } + if (!found) + list_add(&data->un_list, &g_uuid_list); + spin_unlock(&g_uuid_lock); + + if (found) { + CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, + libcfs_nid2str(nid), entry->un_nid_count); + OBD_FREE(data, sizeof(*data)); + } else { + CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); + } + return 0; +} +EXPORT_SYMBOL(class_add_uuid); + +/* Delete the nids for one uuid if specified, otherwise delete all */ +int class_del_uuid(const char *uuid) +{ + LIST_HEAD(deathrow); + struct uuid_nid_data *data; + + spin_lock(&g_uuid_lock); + if (uuid != NULL) { + struct obd_uuid tmp; + + obd_str2uuid(&tmp, uuid); + list_for_each_entry(data, &g_uuid_list, un_list) { + if (obd_uuid_equals(&data->un_uuid, &tmp)) { + list_move(&data->un_list, &deathrow); + break; + } + } + } else + list_splice_init(&g_uuid_list, &deathrow); + spin_unlock(&g_uuid_lock); + + if (uuid != NULL && list_empty(&deathrow)) { + CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); + return -EINVAL; + } + + while (!list_empty(&deathrow)) { + data = list_entry(deathrow.next, struct uuid_nid_data, + un_list); + list_del(&data->un_list); + + CDEBUG(D_INFO, "del uuid %s %s/%d\n", + obd_uuid2str(&data->un_uuid), + libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c new file mode 100644 index 000000000000..b71344a04c7e --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + * Use is subject to license terms. + * + * Author: Johann Lombardi + */ + +#include +#include +#include + +/** + * Initialize new \a lma. Only fid is stored. + * + * \param lma - is the new LMA structure to be initialized + * \param fid - is the FID of the object this LMA belongs to + * \param incompat - features that MDS must understand to access object + */ +void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid, + __u32 incompat) +{ + lma->lma_compat = 0; + lma->lma_incompat = incompat; + lma->lma_self_fid = *fid; + + /* If a field is added in struct lustre_mdt_attrs, zero it explicitly + * and change the test below. */ + LASSERT(sizeof(*lma) == + (offsetof(struct lustre_mdt_attrs, lma_self_fid) + + sizeof(lma->lma_self_fid))); +}; +EXPORT_SYMBOL(lustre_lma_init); + +/** + * Swab, if needed, LMA structure which is stored on-disk in little-endian order. + * + * \param lma - is a pointer to the LMA structure to be swabbed. + */ +void lustre_lma_swab(struct lustre_mdt_attrs *lma) +{ + /* Use LUSTRE_MSG_MAGIC to detect local endianess. */ + if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) { + __swab32s(&lma->lma_compat); + __swab32s(&lma->lma_incompat); + lustre_swab_lu_fid(&lma->lma_self_fid); + } +}; +EXPORT_SYMBOL(lustre_lma_swab); + +/** + * Swab, if needed, SOM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the SOM structure to be swabbed. + */ +void lustre_som_swab(struct som_attrs *attrs) +{ + /* Use LUSTRE_MSG_MAGIC to detect local endianess. */ + if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) { + __swab32s(&attrs->som_compat); + __swab32s(&attrs->som_incompat); + __swab64s(&attrs->som_ioepoch); + __swab64s(&attrs->som_size); + __swab64s(&attrs->som_blocks); + __swab64s(&attrs->som_mountid); + } +}; +EXPORT_SYMBOL(lustre_som_swab); + +/* + * Swab and extract SOM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk SOM extended attribute. + * \param rc - is the SOM xattr stored in \a buf + * \param msd - is the md_som_data structure where to extract SOM attributes. + */ +int lustre_buf2som(void *buf, int rc, struct md_som_data *msd) +{ + struct som_attrs *attrs = (struct som_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no SOM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* check SOM compatibility */ + if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP)) + RETURN(-ENODATA); + + /* unpack SOM attributes */ + lustre_som_swab(attrs); + + /* fill in-memory msd structure */ + msd->msd_compat = attrs->som_compat; + msd->msd_incompat = attrs->som_incompat; + msd->msd_ioepoch = attrs->som_ioepoch; + msd->msd_size = attrs->som_size; + msd->msd_blocks = attrs->som_blocks; + msd->msd_mountid = attrs->som_mountid; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2som); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ + /* Use LUSTRE_MSG_MAGIC to detect local endianess. */ + if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) { + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); + } +}; +EXPORT_SYMBOL(lustre_hsm_swab); + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. + */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustre/lustre/obdclass/md_local_object.c b/drivers/staging/lustre/lustre/obdclass/md_local_object.c new file mode 100644 index 000000000000..ac5f44f19715 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/md_local_object.c @@ -0,0 +1,459 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/md_local_object.c + * + * Lustre Local Object create APIs + * 'create on first mount' facility. Files registed under llo module will + * be created on first mount. + * + * Author: Pravin Shelar + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include + + +/** List head to hold list of objects to be created. */ +static struct list_head llo_lobj_list; + +/** Lock to protect list manipulations */ +static struct mutex llo_lock; + +/** + * Structure used to maintain state of path parsing. + * \see llo_find_entry, llo_store_resolve + */ +struct llo_find_hint { + struct lu_fid *lfh_cfid; + struct md_device *lfh_md; + struct md_object *lfh_pobj; +}; + +/** + * Thread Local storage for this module. + */ +struct llo_thread_info { + /** buffer to resolve path */ + char lti_buf[DT_MAX_PATH]; + /** used for path resolve */ + struct lu_fid lti_fid; + /** used to pass child object fid */ + struct lu_fid lti_cfid; + struct llo_find_hint lti_lfh; + struct md_op_spec lti_spc; + struct md_attr lti_ma; + struct lu_name lti_lname; +}; + +LU_KEY_INIT(llod_global, struct llo_thread_info); +LU_KEY_FINI(llod_global, struct llo_thread_info); + +static struct lu_context_key llod_key = { + .lct_tags = LCT_MD_THREAD, + .lct_init = llod_global_key_init, + .lct_fini = llod_global_key_fini +}; + +static inline struct llo_thread_info * llo_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &llod_key); +} + +/** + * Search md object for given fid. + */ +static struct md_object *llo_locate(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *fid) +{ + struct lu_object *obj; + struct md_object *mdo; + + obj = lu_object_find(env, &md->md_lu_dev, fid, NULL); + if (!IS_ERR(obj)) { + obj = lu_object_locate(obj->lo_header, md->md_lu_dev.ld_type); + LASSERT(obj != NULL); + mdo = (struct md_object *) obj; + } else + mdo = (struct md_object *)obj; + return mdo; +} + +/** + * Lookup FID for object named \a name in directory \a pobj. + */ +static int llo_lookup(const struct lu_env *env, + struct md_object *pobj, + const char *name, + struct lu_fid *fid) +{ + struct llo_thread_info *info = llo_env_info(env); + struct lu_name *lname = &info->lti_lname; + struct md_op_spec *spec = &info->lti_spc; + + spec->sp_feat = NULL; + spec->sp_cr_flags = 0; + spec->sp_cr_lookup = 0; + spec->sp_cr_mode = 0; + + lname->ln_name = name; + lname->ln_namelen = strlen(name); + + return mdo_lookup(env, pobj, lname, fid, spec); +} + +/** + * Function to look up path component, this is passed to parsing + * function. \see llo_store_resolve + * + * \retval rc returns error code for lookup or locate operation + * + * pointer to object is returned in data (lfh->lfh_pobj) + */ +static int llo_find_entry(const struct lu_env *env, + const char *name, void *data) +{ + struct llo_find_hint *lfh = data; + struct md_device *md = lfh->lfh_md; + struct lu_fid *fid = lfh->lfh_cfid; + struct md_object *obj = lfh->lfh_pobj; + int result; + + /* lookup fid for object */ + result = llo_lookup(env, obj, name, fid); + lu_object_put(env, &obj->mo_lu); + + if (result == 0) { + /* get md object for fid that we got in lookup */ + obj = llo_locate(env, md, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + + lfh->lfh_pobj = obj; + return result; +} + +static struct md_object *llo_reg_open(const struct lu_env *env, + struct md_device *md, + struct md_object *p, + const char *name, + struct lu_fid *fid) +{ + struct md_object *o; + int result; + + result = llo_lookup(env, p, name, fid); + if (result == 0) + o = llo_locate(env, md, fid); + else + o = ERR_PTR(result); + + return o; +} + +/** + * Resolve given \a path, on success function returns + * md object for last directory and \a fid points to + * its fid. + */ +struct md_object *llo_store_resolve(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *path, + struct lu_fid *fid) +{ + struct llo_thread_info *info = llo_env_info(env); + struct llo_find_hint *lfh = &info->lti_lfh; + char *local = info->lti_buf; + struct md_object *obj; + int result; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + lfh->lfh_md = md; + lfh->lfh_cfid = fid; + /* start path resolution from backend fs root. */ + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + /* get md object for root */ + obj = llo_locate(env, md, fid); + if (!IS_ERR(obj)) { + /* start path parser from root md */ + lfh->lfh_pobj = obj; + result = dt_path_parser(env, local, llo_find_entry, lfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = lfh->lfh_pobj; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(llo_store_resolve); + +/** + * Returns md object for \a objname in given \a dirname. + */ +struct md_object *llo_store_open(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + struct lu_fid *fid) +{ + struct md_object *obj; + struct md_object *dir; + + /* search md object for parent dir */ + dir = llo_store_resolve(env, md, dt, dirname, fid); + if (!IS_ERR(dir)) { + obj = llo_reg_open(env, md, dir, objname, fid); + lu_object_put(env, &dir->mo_lu); + } else + obj = dir; + + return obj; +} +EXPORT_SYMBOL(llo_store_open); + +static struct md_object *llo_create_obj(const struct lu_env *env, + struct md_device *md, + struct md_object *dir, + const char *objname, + const struct lu_fid *fid, + const struct dt_index_features *feat) +{ + struct llo_thread_info *info = llo_env_info(env); + struct md_object *mdo; + struct md_attr *ma = &info->lti_ma; + struct md_op_spec *spec = &info->lti_spc; + struct lu_name *lname = &info->lti_lname; + struct lu_attr *la = &ma->ma_attr; + int rc; + + mdo = llo_locate(env, md, fid); + if (IS_ERR(mdo)) + return mdo; + + lname->ln_name = objname; + lname->ln_namelen = strlen(objname); + + spec->sp_feat = feat; + spec->sp_cr_flags = 0; + spec->sp_cr_lookup = 1; + spec->sp_cr_mode = 0; + + if (feat == &dt_directory_features) + la->la_mode = S_IFDIR | S_IXUGO; + else + la->la_mode = S_IFREG; + + la->la_mode |= S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + la->la_uid = la->la_gid = 0; + la->la_valid = LA_MODE | LA_UID | LA_GID; + + ma->ma_valid = 0; + ma->ma_need = 0; + + rc = mdo_create(env, dir, lname, mdo, spec, ma); + + if (rc) { + lu_object_put(env, &mdo->mo_lu); + mdo = ERR_PTR(rc); + } + + return mdo; +} + +/** + * Create md object, object could be diretcory or + * special index defined by \a feat in \a directory. + * + * \param md device + * \param dirname parent directory + * \param objname file name + * \param fid object fid + * \param feat index features required for directory create + */ + +struct md_object *llo_store_create_index(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid, + const struct dt_index_features *feat) +{ + struct llo_thread_info *info = llo_env_info(env); + struct md_object *obj; + struct md_object *dir; + struct lu_fid *ignore = &info->lti_fid; + + dir = llo_store_resolve(env, md, dt, dirname, ignore); + if (!IS_ERR(dir)) { + obj = llo_create_obj(env, md, dir, objname, fid, feat); + lu_object_put(env, &dir->mo_lu); + } else { + obj = dir; + } + return obj; +} + +EXPORT_SYMBOL(llo_store_create_index); + +/** + * Create md object for regular file in \a directory. + * + * \param md device + * \param dirname parent directory + * \param objname file name + * \param fid object fid. + */ + +struct md_object *llo_store_create(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid) +{ + return llo_store_create_index(env, md, dt, dirname, + objname, fid, NULL); +} + +EXPORT_SYMBOL(llo_store_create); + +/** + * Register object for 'create on first mount' facility. + * objects are created in order of registration. + */ + +void llo_local_obj_register(struct lu_local_obj_desc *llod) +{ + mutex_lock(&llo_lock); + list_add_tail(&llod->llod_linkage, &llo_lobj_list); + mutex_unlock(&llo_lock); +} + +EXPORT_SYMBOL(llo_local_obj_register); + +void llo_local_obj_unregister(struct lu_local_obj_desc *llod) +{ + mutex_lock(&llo_lock); + list_del(&llod->llod_linkage); + mutex_unlock(&llo_lock); +} + +EXPORT_SYMBOL(llo_local_obj_unregister); + +/** + * Created registed objects. + */ + +int llo_local_objects_setup(const struct lu_env *env, + struct md_device * md, + struct dt_device *dt) +{ + struct llo_thread_info *info = llo_env_info(env); + struct lu_fid *fid; + struct lu_local_obj_desc *scan; + struct md_object *mdo; + const char *dir; + int rc = 0; + + fid = &info->lti_cfid; + mutex_lock(&llo_lock); + + list_for_each_entry(scan, &llo_lobj_list, llod_linkage) { + lu_local_obj_fid(fid, scan->llod_oid); + dir = ""; + if (scan->llod_dir) + dir = scan->llod_dir; + + if (scan->llod_is_index) + mdo = llo_store_create_index(env, md, dt , + dir, scan->llod_name, + fid, + scan->llod_feat); + else + mdo = llo_store_create(env, md, dt, + dir, scan->llod_name, + fid); + if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) { + rc = PTR_ERR(mdo); + CERROR("creating obj [%s] fid = "DFID" rc = %d\n", + scan->llod_name, PFID(fid), rc); + goto out; + } + + if (!IS_ERR(mdo)) + lu_object_put(env, &mdo->mo_lu); + } + +out: + mutex_unlock(&llo_lock); + return rc; +} + +EXPORT_SYMBOL(llo_local_objects_setup); + +int llo_global_init(void) +{ + int result; + + INIT_LIST_HEAD(&llo_lobj_list); + mutex_init(&llo_lock); + + LU_CONTEXT_KEY_INIT(&llod_key); + result = lu_context_key_register(&llod_key); + return result; +} + +void llo_global_fini(void) +{ + lu_context_key_degister(&llod_key); + LASSERT(list_empty(&llo_lobj_list)); +} diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c new file mode 100644 index 000000000000..c4f0dbc23611 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/mea.c @@ -0,0 +1,112 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include +#include /* for request_module() */ +#include +#include +#include +#include +#include + +static int mea_last_char_hash(int count, char *name, int namelen) +{ + unsigned int c; + + c = name[namelen - 1]; + if (c == 0) + CWARN("looks like wrong len is passed\n"); + c = c % count; + return c; +} + +static int mea_all_chars_hash(int count, char *name, int namelen) +{ + unsigned int c = 0; + + while (--namelen >= 0) + c += name[namelen]; + c = c % count; + return c; +} + +int raw_name2idx(int hashtype, int count, const char *name, int namelen) +{ + unsigned int c = 0; + int idx; + + LASSERT(namelen > 0); + + if (filename_is_volatile(name, namelen, &idx)) { + if ((idx >= 0) && (idx < count)) + return idx; + goto hashchoice; + } + + if (count <= 1) + return 0; + +hashchoice: + switch (hashtype) { + case MEA_MAGIC_LAST_CHAR: + c = mea_last_char_hash(count, (char *)name, namelen); + break; + case MEA_MAGIC_ALL_CHARS: + c = mea_all_chars_hash(count, (char *)name, namelen); + break; + case MEA_MAGIC_HASH_SEGMENT: + CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n"); + break; + default: + CERROR("Unknown hash type 0x%x\n", hashtype); + } + + LASSERT(c < count); + return c; +} +EXPORT_SYMBOL(raw_name2idx); + +int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen) +{ + unsigned int c; + + LASSERT(mea && mea->mea_count); + + c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen); + + LASSERT(c < mea->mea_count); + return c; +} +EXPORT_SYMBOL(mea_name2idx); diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c new file mode 100644 index 000000000000..9636aa9efed0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -0,0 +1,1899 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include +#include +#include +#include +#include + +#include "llog_internal.h" + +static cfs_hash_ops_t uuid_hash_ops; +static cfs_hash_ops_t nid_hash_ops; +static cfs_hash_ops_t nid_stat_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. + * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + RETURN(NULL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. + * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. */ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, endh, 0); +} +EXPORT_SYMBOL(class_parse_net); + +/* 1 param contains key and match + * 0 param contains key and not match + * -1 param does not contain key + */ +int class_match_nid(char *buf, char *key, lnet_nid_t nid) +{ + lnet_nid_t tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified nids */ + while (class_parse_nid(buf, &tmp, &buf) == 0) { + if (tmp == nid) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_nid); + +int class_match_net(char *buf, char *key, __u32 net) +{ + __u32 tmp; + int rc = -1; + + while (class_find_param(buf, key, &buf) == 0) { + /* please restrict to the nids pertaining to + * the specified networks */ + while (class_parse_net(buf, &tmp, &buf) == 0) { + if (tmp == net) + return 1; + } + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(class_match_net); + +/********************** class fns **********************/ + +/** + * Create a new obd device and set the type, name and uuid. If successful, + * the new device can be accessed by either name or uuid. + */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + uuid = lustre_cfg_string(lcfg, 2); + + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + GOTO(out, rc); + } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. */ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + cfs_init_timer(&obd->obd_recovery_timer); + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + + llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + GOTO(out, rc = -EINVAL); + } + memcpy(obd->obd_uuid.uuid, uuid, len); + + /* do the attach */ + if (OBP(obd, attach)) { + rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg); + if (rc) + GOTO(out, rc = -EINVAL); + } + + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + RETURN(0); + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. + */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + struct obd_export *exp; + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. */ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_hash, err = -ENOMEM); + + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) + GOTO(err_hash, err = PTR_ERR(exp)); + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exp, err); + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); +err_exp: + if (obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "attach", obd); + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + + /* wait for already-arrived-connections to finish. */ + while (obd->obd_conn_inprogress > 0) { + spin_unlock(&obd->obd_dev_lock); + + cond_resched(); + + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcounf - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0); + class_disconnect_exports(obd); + } + + /* Precleanup, we must make sure all exports get destroyed. */ + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} +EXPORT_SYMBOL(class_cleanup); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int err; + int refs; + + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); + lu_ref_del(&obd->obd_reference, scope, source); + + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + + if ((refs == 1) && obd->obd_stopping) { + /* All exports have been destroyed; there should + be no more in-progress ops by this point.*/ + + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + if (OBP(obd, detach)) { + err = OBP(obd, detach)(obd); + if (err) + CERROR("Detach returned %d\n", err); + } + class_release_dev(obd); + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} +EXPORT_SYMBOL(class_add_conn); + +/** Remove a failover nid location. + */ +int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +LIST_HEAD(lustre_profile_list); + +struct lustre_profile *class_get_profile(const char * prof) +{ + struct lustre_profile *lprof; + + ENTRY; + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + RETURN(lprof); + } + } + RETURN(NULL); +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. + */ +int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + list_add(&lprof->lp_list, &lustre_profile_list); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + ENTRY; + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + RETURN(-EINVAL); + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + RETURN(0); +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. + * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + ENTRY; + + if (cfg == NULL || new_name == NULL) + RETURN(ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (param == NULL) + RETURN(ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (new_param == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) { + OBD_FREE(new_param, new_len); + RETURN(ERR_PTR(-ENOMEM)); + } + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs); + + OBD_FREE(new_param, new_len); + OBD_FREE_PTR(bufs); + if (new_cfg == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; + + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} +EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch(lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64 + " (%s)\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. */ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, 0) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. + */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); + } + /* Fall through */ + break; + } + } + + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) +{ + struct lprocfs_vars *var; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + ENTRY; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + class_match_param(key, prefix, &key); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s (missing '=')\n", key); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, (char *)var->name, 0) == 0 && + keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + if (var->write_fptr) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->write_fptr)(NULL, sval, + vallen, data); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* If the prefix doesn't match, return error so we + can pass it down the stack */ + if (strnchr(key, keylen, '.')) + RETURN(-ENOSYS); + CERROR("%s: unknown param %s\n", + (char *)lustre_cfg_string(lcfg, 0), key); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("writing proc entry %s err %d\n", + var->name, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", + lustre_cfg_string(lcfg, 0), + (int)strlen(prefix) - 1, prefix, + (int)(sval - key - 1), key, sval); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + RETURN(rc); +} +EXPORT_SYMBOL(class_process_proc_param); + +extern int lustre_check_exclusion(struct super_block *sb, char *svname); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. + */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char*) (rec + 1); + int rc = 0; + ENTRY; + + //class_config_dump_handler(handle, rec, data); + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int inst = 0, swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + clli->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags = 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(clli->cfg_flags & CFG_F_COMPAT146) && + !(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! " + "(inst: %p, uuid: %s, flags: %#x)\n", + clli->cfg_instance, + clli->cfg_uuid.uuid, clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + if (clli->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". + */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + + + if ((clli->cfg_flags & CFG_F_EXCLUDE) && + (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + + lustre_cfg_bufs_init(&bufs, lcfg); + + if (clli && clli->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(clli->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) + GOTO(out, rc = -ENOMEM); + sprintf(inst_name, "%s-%p", + lustre_cfg_string(lcfg, 0), + clli->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to insure they + are unique */ + if (clli && clli->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } + + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); + + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); + + if (inst) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = {0, 0}; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) + ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + lustre_cfg_string(lcfg, i)); + } + } + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_handle *llh; + int rc; + + ENTRY; + + LCONSOLE_INFO("Dumping config log %s\n", name); + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL); +parse_out: + llog_close(env, llh); + + LCONSOLE_INFO("End config log %s\n", name); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_dump_llog); + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. + */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) + RETURN(-ENOMEM); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(&exp->exp_connection->c_peer.nid); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed); +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static cfs_hash_ops_t nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c new file mode 100644 index 000000000000..99adad9793c5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -0,0 +1,1321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. + */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" + "failed from the MGS (%d). Make sure this " + "client and the MGS are running compatible " + "versions of Lustre.\n", + mgc->obd_name, logname, rc); + + if (rc) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " + "failed (%d). This may be the result of " + "communication errors between this node and " + "the MGS, a bad configuration, or other " + "errors. See the syslog for more " + "information.\n", mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. + */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return(rc); +} +EXPORT_SYMBOL(do_lcfg); + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. + */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0); + } + return rc; +} + +DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int recov_bk; + int rc = 0, i = 0, j, len; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) + GOTO(out_free, rc = -ENOMEM); + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data, NULL); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting " + "not compatible with current mgc. " + "Force to use current mgc setting that is " + "IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. */ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + GOTO(out_free, rc = -EINVAL); + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, 0, 0); + OBD_FREE_PTR(uuid); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, 0, 0, 0); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* Try all connections, but only once. */ + recov_bk = 1; + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + if (rc) + /* nonfatal */ + CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = 0, *ptr = 0; + int i, rc = 0, len = 0; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* The MGC has no recoverable data in any case. + * force shotdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + /* Clean the nid uuids */ + if (!niduuid) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, 0, 0, 0); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* The lsi has one reference for every server that is using the disk - + e.g. MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. + * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash = strrchr(svname, '-'); + if (!dash) { + dash = strrchr(svname, ':'); + if (!dash) + return -EINVAL; + } + + /* interpret -MDTXXXXX-mdc as mdt, the better way is to pass + * in the fsname, then determine the server index */ + if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) { + dash--; + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + } + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -1; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + + +/* Get the index from the obd name. + rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + unsigned long index; + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -EINVAL; + + dash++; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strcmp(dash, "all") == 0) + return rc | LDD_F_SV_ALL; + + index = simple_strtoul(dash, (char **)endptr, 16); + *idx = index; + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common betweeen server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. */ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + lu_types_stop(); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for(i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 index, *exclude_list; + int rc = 0, devmax; + ENTRY; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. */ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s'\n", s1); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " + "/sbin/mount.lustre is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " + "/sbin/mount.lustre. Please install " + "version %s\n", LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, 4096); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. */ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + int length; + char *tail = strchr(s1 + 6, ','); + if (tail == NULL) + length = strlen(s1); + else + length = tail - s1; + length -= 6; + strncat(lmd->lmd_params, s1 + 6, length); + strcat(lmd->lmd_params, " "); + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. */ + *s1 = '\0'; + break; + } + + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " + "(need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') ; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + sprintf(lmd->lmd_profile, "%s-client", s1); + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strcpy(lmd->lmd_dev, devname); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strcpy(lmd->lmd_opts, options); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. -o flock,abort_recov) + */ +int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (!client_fill_super) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " + "client mount! Is the 'lustre' " + "module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_put_lsi(sb); + GOTO(out, rc); + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { + CERROR("This is client-side-only module, " + "cannot handle server mount.\n"); + rc = -EINVAL; + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + GOTO(out, rc); +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { data, NULL }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} + +void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c b/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c new file mode 100644 index 000000000000..a3a44091c433 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_mount_server.c @@ -0,0 +1,1783 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount_server.c + * + * Server mount routines + * + * Author: Nathan Rutman + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER | D_CONFIG /* | D_WARNING */) +#define PRINT_CMD CDEBUG +#define PRINT_MASK (D_SUPER | D_CONFIG) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/*********** mount lookup *********/ + +DEFINE_MUTEX(lustre_mount_info_lock); +static LIST_HEAD(server_mount_info_list); + +static struct lustre_mount_info *server_find_mount(const char *name) +{ + struct list_head *tmp; + struct lustre_mount_info *lmi; + ENTRY; + + list_for_each(tmp, &server_mount_info_list) { + lmi = list_entry(tmp, struct lustre_mount_info, + lmi_list_chain); + if (strcmp(name, lmi->lmi_name) == 0) + RETURN(lmi); + } + RETURN(NULL); +} + +/* we must register an obd for a mount before we call the setup routine. + *_setup will call lustre_get_mount to get the mnt struct + by obd_name, since we can't pass the pointer to setup. */ +static int server_register_mount(const char *name, struct super_block *sb, + struct vfsmount *mnt) +{ + struct lustre_mount_info *lmi; + char *name_cp; + ENTRY; + + LASSERT(sb); + + OBD_ALLOC(lmi, sizeof(*lmi)); + if (!lmi) + RETURN(-ENOMEM); + OBD_ALLOC(name_cp, strlen(name) + 1); + if (!name_cp) { + OBD_FREE(lmi, sizeof(*lmi)); + RETURN(-ENOMEM); + } + strcpy(name_cp, name); + + mutex_lock(&lustre_mount_info_lock); + + if (server_find_mount(name)) { + mutex_unlock(&lustre_mount_info_lock); + OBD_FREE(lmi, sizeof(*lmi)); + OBD_FREE(name_cp, strlen(name) + 1); + CERROR("Already registered %s\n", name); + RETURN(-EEXIST); + } + lmi->lmi_name = name_cp; + lmi->lmi_sb = sb; + lmi->lmi_mnt = mnt; + list_add(&lmi->lmi_list_chain, &server_mount_info_list); + + mutex_unlock(&lustre_mount_info_lock); + + CDEBUG(D_MOUNT, "reg_mnt %p from %s\n", lmi->lmi_mnt, name); + + RETURN(0); +} + +/* when an obd no longer needs a mount */ +static int server_deregister_mount(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + if (!lmi) { + mutex_unlock(&lustre_mount_info_lock); + CERROR("%s not registered\n", name); + RETURN(-ENOENT); + } + + CDEBUG(D_MOUNT, "dereg_mnt %p from %s\n", lmi->lmi_mnt, name); + + OBD_FREE(lmi->lmi_name, strlen(lmi->lmi_name) + 1); + list_del(&lmi->lmi_list_chain); + OBD_FREE(lmi, sizeof(*lmi)); + mutex_unlock(&lustre_mount_info_lock); + + RETURN(0); +} + +/* obd's look up a registered mount using their obdname. This is just + for initial obd setup to find the mount struct. It should not be + called every time you want to mntget. */ +struct lustre_mount_info *server_get_mount(const char *name) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(NULL); + } + lsi = s2lsi(lmi->lmi_sb); + + atomic_inc(&lsi->lsi_mounts); + + CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d\n", lmi->lmi_mnt, + name, atomic_read(&lsi->lsi_mounts)); + + RETURN(lmi); +} +EXPORT_SYMBOL(server_get_mount); + +/* + * Used by mdt to get mount_info from obdname. + * There are no blocking when using the mount_info. + * Do not use server_get_mount for this purpose. + */ +struct lustre_mount_info *server_get_mount_2(const char *name) +{ + struct lustre_mount_info *lmi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) + CERROR("Can't find mount for %s\n", name); + + RETURN(lmi); +} +EXPORT_SYMBOL(server_get_mount_2); + +/* to be called from obd_cleanup methods */ +int server_put_mount(const char *name, struct vfsmount *mnt) +{ + struct lustre_mount_info *lmi; + struct lustre_sb_info *lsi; + ENTRY; + + mutex_lock(&lustre_mount_info_lock); + lmi = server_find_mount(name); + mutex_unlock(&lustre_mount_info_lock); + if (!lmi) { + CERROR("Can't find mount for %s\n", name); + RETURN(-ENOENT); + } + lsi = s2lsi(lmi->lmi_sb); + + CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d\n", + lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts)); + + if (lustre_put_lsi(lmi->lmi_sb)) + CDEBUG(D_MOUNT, "Last put of mnt %p from %s\n", + lmi->lmi_mnt, name); + + /* this obd should never need the mount again */ + server_deregister_mount(name); + + RETURN(0); +} +EXPORT_SYMBOL(server_put_mount); + +/* Corresponding to server_get_mount_2 */ +int server_put_mount_2(const char *name, struct vfsmount *mnt) +{ + ENTRY; + RETURN(0); +} +EXPORT_SYMBOL(server_put_mount_2); + +/* Set up a MGS to serve startup logs */ +static int server_start_mgs(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct vfsmount *mnt = lsi->lsi_srv_mnt; + struct lustre_mount_info *lmi; + int rc = 0; + ENTRY; + + /* It is impossible to have more than 1 MGS per node, since + MGC wouldn't know which to connect to */ + lmi = server_find_mount(LUSTRE_MGS_OBDNAME); + if (lmi) { + lsi = s2lsi(lmi->lmi_sb); + LCONSOLE_ERROR_MSG(0x15d, "The MGS service was already started" + " from server\n"); + RETURN(-EALREADY); + } + + CDEBUG(D_CONFIG, "Start MGS service %s\n", LUSTRE_MGS_OBDNAME); + + rc = server_register_mount(LUSTRE_MGS_OBDNAME, sb, mnt); + + if (!rc) { + rc = lustre_start_simple(LUSTRE_MGS_OBDNAME, LUSTRE_MGS_NAME, + LUSTRE_MGS_OBDNAME, 0, 0, + lsi->lsi_osd_obdname, 0); + /* Do NOT call server_deregister_mount() here. This leads to + * inability cleanup cleanly and free lsi and other stuff when + * mgs calls server_put_mount() in error handling case. -umka */ + } + + if (rc) + LCONSOLE_ERROR_MSG(0x15e, "Failed to start MGS '%s' (%d). " + "Is the 'mgs' module loaded?\n", + LUSTRE_MGS_OBDNAME, rc); + RETURN(rc); +} + +static int server_stop_mgs(struct super_block *sb) +{ + struct obd_device *obd; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Stop MGS service %s\n", LUSTRE_MGS_OBDNAME); + + /* There better be only one MGS */ + obd = class_name2obd(LUSTRE_MGS_OBDNAME); + if (!obd) { + CDEBUG(D_CONFIG, "mgs %s not running\n", LUSTRE_MGS_OBDNAME); + RETURN(-EALREADY); + } + + /* The MGS should always stop when we say so */ + obd->obd_force = 1; + rc = class_manual_cleanup(obd); + RETURN(rc); +} + +/* Since there's only one mgc per node, we have to change it's fs to get + access to the right disk. */ +static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Set mgc disk for %s\n", lsi->lsi_lmd->lmd_dev); + + /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ + rc = obd_set_info_async(NULL, mgc->obd_self_export, + sizeof(KEY_SET_FS), KEY_SET_FS, + sizeof(*sb), sb, NULL); + if (rc != 0) + CERROR("can't set_fs %d\n", rc); + + RETURN(rc); +} + +static int server_mgc_clear_fs(struct obd_device *mgc) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "Unassign mgc disk\n"); + + rc = obd_set_info_async(NULL, mgc->obd_self_export, + sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS, + 0, NULL, NULL); + RETURN(rc); +} + +static int is_mdc_device(const char *devname) +{ + char *ptr; + + ptr = strrchr(devname, '-'); + if (ptr != NULL && strcmp(ptr, "-mdc") == 0) + return 1; + + return 0; +} + +static inline int tgt_is_mdt0(const char *tgtname) +{ + __u32 idx; + int type; + + type = server_name2index(tgtname, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT) + return 0; + + return idx == 0; +} + +static inline int is_mdc_for_mdt0(const char *devname) +{ + char *ptr; + + if (!is_mdc_device(devname)) + return 0; + + ptr = strrchr(devname, '-'); + if (ptr == NULL) + return 0; + + *ptr = 0; + if (tgt_is_mdt0(devname)) { + *ptr = '-'; + return 1; + } + *ptr = '-'; + return 0; +} + +/** + * Convert OST/MDT name(fsname-OSTxxxx) to a lwp name + * (fsname-MDT0000-lwp-OSTxxxx) + **/ +int tgt_name2lwpname(const char *svname, char *lwpname) +{ + char *fsname; + const char *tgt; + int rc; + ENTRY; + + OBD_ALLOC(fsname, MTI_NAME_MAXLEN); + if (fsname == NULL) + RETURN(-ENOMEM); + + rc = server_name2fsname(svname, fsname, &tgt); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname. %d\n", + svname, rc); + GOTO(cleanup, rc); + } + + if (*tgt != '-' && *tgt != ':') { + CERROR("%s: invalid svname name!\n", svname); + GOTO(cleanup, rc = -EINVAL); + } + + tgt++; + if (strncmp(tgt, "OST", 3) != 0 && strncmp(tgt, "MDT", 3) != 0) { + CERROR("%s is not an OST or MDT target!\n", svname); + GOTO(cleanup, rc = -EINVAL); + } + sprintf(lwpname, "%s-MDT0000-%s-%s", fsname, LUSTRE_LWP_NAME, tgt); +cleanup: + if (fsname != NULL) + OBD_FREE(fsname, MTI_NAME_MAXLEN); + RETURN(rc); +} +EXPORT_SYMBOL(tgt_name2lwpname); + +static LIST_HEAD(lwp_register_list); +DEFINE_MUTEX(lwp_register_list_lock); + +int lustre_register_lwp_item(const char *lwpname, struct obd_export **exp, + register_lwp_cb cb_func, void *cb_data) +{ + struct obd_device *lwp; + struct lwp_register_item *lri; + ENTRY; + + LASSERTF(strlen(lwpname) < MTI_NAME_MAXLEN, "lwpname is too long %s\n", + lwpname); + LASSERT(exp != NULL && *exp == NULL); + + OBD_ALLOC_PTR(lri); + if (lri == NULL) + RETURN(-ENOMEM); + + mutex_lock(&lwp_register_list_lock); + + lwp = class_name2obd(lwpname); + if (lwp != NULL && lwp->obd_set_up == 1) { + struct obd_uuid *uuid; + + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) { + mutex_unlock(&lwp_register_list_lock); + OBD_FREE_PTR(lri); + RETURN(-ENOMEM); + } + memcpy(uuid->uuid, lwpname, strlen(lwpname)); + *exp = cfs_hash_lookup(lwp->obd_uuid_hash, uuid); + OBD_FREE_PTR(uuid); + } + + memcpy(lri->lri_name, lwpname, strlen(lwpname)); + lri->lri_exp = exp; + lri->lri_cb_func = cb_func; + lri->lri_cb_data = cb_data; + INIT_LIST_HEAD(&lri->lri_list); + list_add(&lri->lri_list, &lwp_register_list); + + if (*exp != NULL && cb_func != NULL) + cb_func(cb_data); + + mutex_unlock(&lwp_register_list_lock); + RETURN(0); +} +EXPORT_SYMBOL(lustre_register_lwp_item); + +void lustre_deregister_lwp_item(struct obd_export **exp) +{ + struct lwp_register_item *lri, *tmp; + + mutex_lock(&lwp_register_list_lock); + list_for_each_entry_safe(lri, tmp, &lwp_register_list, lri_list) { + if (exp == lri->lri_exp) { + if (*exp) + class_export_put(*exp); + list_del(&lri->lri_list); + OBD_FREE_PTR(lri); + break; + } + } + mutex_unlock(&lwp_register_list_lock); +} +EXPORT_SYMBOL(lustre_deregister_lwp_item); + +static void lustre_notify_lwp_list(struct obd_export *exp) +{ + struct lwp_register_item *lri, *tmp; + LASSERT(exp != NULL); + + mutex_lock(&lwp_register_list_lock); + list_for_each_entry_safe(lri, tmp, &lwp_register_list, lri_list) { + if (strcmp(exp->exp_obd->obd_name, lri->lri_name)) + continue; + if (*lri->lri_exp != NULL) + continue; + *lri->lri_exp = class_export_get(exp); + if (lri->lri_cb_func != NULL) + lri->lri_cb_func(lri->lri_cb_data); + } + mutex_unlock(&lwp_register_list_lock); +} + +static int lustre_lwp_connect(struct obd_device *lwp) +{ + struct lu_env env; + struct lu_context session_ctx; + struct obd_export *exp; + struct obd_uuid *uuid = NULL; + struct obd_connect_data *data = NULL; + int rc; + ENTRY; + + /* log has been fully processed, let clients connect */ + rc = lu_env_init(&env, lwp->obd_lu_dev->ld_type->ldt_ctx_tags); + if (rc != 0) + RETURN(rc); + + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX; + data->ocd_version = LUSTRE_VERSION_CODE; + data->ocd_connect_flags |= OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | + OBD_CONNECT_AT | OBD_CONNECT_LRU_RESIZE | + OBD_CONNECT_FULL20 | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LIGHTWEIGHT; + OBD_ALLOC_PTR(uuid); + if (uuid == NULL) + GOTO(out, rc = -ENOMEM); + + if (strlen(lwp->obd_name) > sizeof(uuid->uuid)) { + CERROR("%s: Too long lwp name %s, max_size is %d\n", + lwp->obd_name, lwp->obd_name, (int)sizeof(uuid->uuid)); + GOTO(out, rc = -EINVAL); + } + + /* Use lwp name as the uuid, so we find the export by lwp name later */ + memcpy(uuid->uuid, lwp->obd_name, strlen(lwp->obd_name)); + rc = obd_connect(&env, &exp, lwp, uuid, data, NULL); + if (rc != 0) + CERROR("%s: connect failed: rc = %d\n", lwp->obd_name, rc); + else + lustre_notify_lwp_list(exp); + +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (uuid != NULL) + OBD_FREE_PTR(uuid); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + RETURN(rc); +} + +/** + * lwp is used by slaves (Non-MDT0 targets) to manage the connection + * to MDT0. + **/ +static int lustre_lwp_setup(struct lustre_cfg *lcfg, struct lustre_sb_info *lsi) +{ + struct obd_connect_data *data = NULL; + struct obd_device *obd; + char *lwpname = NULL; + char *lwpuuid = NULL; + int rc; + ENTRY; + + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc) { + CERROR("%s: Can't add uuid: rc =%d\n", lsi->lsi_svname, rc); + GOTO(out, rc); + } + + OBD_ALLOC(lwpname, MTI_NAME_MAXLEN); + if (lwpname == NULL) + GOTO(out, rc = -ENOMEM); + + rc = tgt_name2lwpname(lsi->lsi_svname, lwpname); + if (rc != 0) { + CERROR("%s: failed to generate lwp name. %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc); + } + + OBD_ALLOC(lwpuuid, MTI_NAME_MAXLEN); + if (lwpuuid == NULL) + GOTO(out, rc = -ENOMEM); + + sprintf(lwpuuid, "%s_UUID", lwpname); + rc = lustre_start_simple(lwpname, LUSTRE_LWP_NAME, + lwpuuid, lustre_cfg_string(lcfg, 1), + 0, 0, 0); + if (rc) { + CERROR("%s: setup up failed: rc %d\n", lwpname, rc); + GOTO(out, rc); + } + + obd = class_name2obd(lwpname); + LASSERT(obd != NULL); + + rc = lustre_lwp_connect(obd); + if (rc != 0) + CERROR("%s: connect failed: rc = %d\n", lwpname, rc); +out: + if (data != NULL) + OBD_FREE_PTR(data); + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (lwpuuid != NULL) + OBD_FREE(lwpuuid, MTI_NAME_MAXLEN); + + RETURN(rc); +} + +/* the caller is responsible for memory free */ +static struct obd_device *lustre_find_lwp(struct lustre_sb_info *lsi, + char **lwpname, char **logname) +{ + struct obd_device *lwp; + int rc = 0; + ENTRY; + + LASSERT(lwpname != NULL); + LASSERT(IS_OST(lsi) || IS_MDT(lsi)); + + OBD_ALLOC(*lwpname, MTI_NAME_MAXLEN); + if (*lwpname == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + if (logname != NULL) { + OBD_ALLOC(*logname, MTI_NAME_MAXLEN); + if (*logname == NULL) + GOTO(out, rc = -ENOMEM); + rc = server_name2fsname(lsi->lsi_svname, *lwpname, NULL); + if (rc != 0) { + CERROR("%s: failed to get fsname from svname. %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + sprintf(*logname, "%s-client", *lwpname); + } + + rc = tgt_name2lwpname(lsi->lsi_svname, *lwpname); + if (rc != 0) { + CERROR("%s: failed to generate lwp name. %d\n", + lsi->lsi_svname, rc); + GOTO(out, rc = -EINVAL); + } + + lwp = class_name2obd(*lwpname); + +out: + if (rc != 0) { + if (*lwpname != NULL) { + OBD_FREE(*lwpname, MTI_NAME_MAXLEN); + *lwpname = NULL; + } + if (logname != NULL && *logname != NULL) { + OBD_FREE(*logname, MTI_NAME_MAXLEN); + *logname = NULL; + } + lwp = ERR_PTR(rc); + } + + RETURN(lwp != NULL ? lwp : ERR_PTR(-ENOENT)); +} + +static int lustre_lwp_add_conn(struct lustre_cfg *cfg, + struct lustre_sb_info *lsi) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *lcfg = NULL; + char *lwpname = NULL; + struct obd_device *lwp; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, NULL); + if (IS_ERR(lwp)) { + CERROR("%s: can't find lwp device.\n", lsi->lsi_svname); + GOTO(out, rc = PTR_ERR(lwp)); + } + LASSERT(lwpname != NULL); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_bufs_reset(bufs, lwpname); + lustre_cfg_bufs_set_string(bufs, 1, + lustre_cfg_string(cfg, 1)); + + lcfg = lustre_cfg_new(LCFG_ADD_CONN, bufs); + + rc = class_add_conn(lwp, lcfg); + if (rc) + CERROR("%s: can't add conn: rc = %d\n", lwpname, rc); + +out: + if (bufs != NULL) + OBD_FREE_PTR(bufs); + if (lcfg != NULL) + lustre_cfg_free(lcfg); + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Retrieve MDT nids from the client log, then start the lwp device. + * there are only two scenarios which would include mdt nid. + * 1. + * marker 5 (flags=0x01, v2.1.54.0) lustre-MDT0000 'add mdc' xxx- + * add_uuid nid=192.168.122.162@tcp(0x20000c0a87aa2) 0: 1:192.168.122.162@tcp + * attach 0:lustre-MDT0000-mdc 1:mdc 2:lustre-clilmv_UUID + * setup 0:lustre-MDT0000-mdc 1:lustre-MDT0000_UUID 2:192.168.122.162@tcp + * add_uuid nid=192.168.172.1@tcp(0x20000c0a8ac01) 0: 1:192.168.172.1@tcp + * add_conn 0:lustre-MDT0000-mdc 1:192.168.172.1@tcp + * modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID xxxx + * marker 5 (flags=0x02, v2.1.54.0) lustre-MDT0000 'add mdc' xxxx- + * 2. + * marker 7 (flags=0x01, v2.1.54.0) lustre-MDT0000 'add failnid' xxxx- + * add_uuid nid=192.168.122.2@tcp(0x20000c0a87a02) 0: 1:192.168.122.2@tcp + * add_conn 0:lustre-MDT0000-mdc 1:192.168.122.2@tcp + * marker 7 (flags=0x02, v2.1.54.0) lustre-MDT0000 'add failnid' xxxx- + **/ +static int client_lwp_config_process(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char *) (rec + 1); + struct lustre_cfg *lcfg = NULL; + struct lustre_sb_info *lsi; + int rc = 0, swab = 0; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + RETURN(-EINVAL); + } + + LASSERT(clli->cfg_sb != NULL); + lsi = s2lsi(clli->cfg_sb); + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + switch (lcfg->lcfg_command) { + case LCFG_MARKER: { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + if (marker->cm_flags & CM_SKIP || + marker->cm_flags & CM_EXCLUDE) + GOTO(out, rc = 0); + + if (!tgt_is_mdt0(marker->cm_tgtname)) + GOTO(out, rc = 0); + + if (!strncmp(marker->cm_comment, "add mdc", 7) || + !strncmp(marker->cm_comment, "add failnid", 11)) { + if (marker->cm_flags & CM_START) { + clli->cfg_flags = CFG_F_MARKER; + /* This hack is to differentiate the + * ADD_UUID is come from "add mdc" record + * or from "add failnid" record. */ + if (!strncmp(marker->cm_comment, + "add failnid", 11)) + clli->cfg_flags |= CFG_F_SKIP; + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags = 0; + } + } + break; + } + case LCFG_ADD_UUID: { + if (clli->cfg_flags == CFG_F_MARKER) { + rc = lustre_lwp_setup(lcfg, lsi); + /* XXX: process only the first nid as + * we don't need another instance of lwp */ + clli->cfg_flags |= CFG_F_SKIP; + } else if (clli->cfg_flags == (CFG_F_MARKER | CFG_F_SKIP)) { + rc = class_add_uuid(lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid); + if (rc) + CERROR("%s: Fail to add uuid, rc:%d\n", + lsi->lsi_svname, rc); + } + break; + } + case LCFG_ADD_CONN: { + if (is_mdc_for_mdt0(lustre_cfg_string(lcfg, 0))) + rc = lustre_lwp_add_conn(lcfg, lsi); + break; + } + default: + break; + } +out: + RETURN(rc); +} + +static int lustre_disconnect_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp; + char *lwpname = NULL; + char *logname = NULL; + struct lustre_cfg *lcfg = NULL; + struct lustre_cfg_bufs *bufs = NULL; + struct config_llog_instance *cfg = NULL; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, &logname); + if (IS_ERR(lwp) && PTR_ERR(lwp) != -ENOENT) + GOTO(out, rc = PTR_ERR(lwp)); + + LASSERT(lwpname != NULL); + LASSERT(logname != NULL); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + /* end log first */ + cfg->cfg_instance = sb; + rc = lustre_end_log(sb, logname, cfg); + if (rc != 0) { + CERROR("%s: Can't end config log %s.\n", lwpname, logname); + GOTO(out, rc); + } + + if (PTR_ERR(lwp) == -ENOENT) { + CDEBUG(D_CONFIG, "%s: lwp device wasn't started.\n", + lsi->lsi_svname); + GOTO(out, rc = 0); + } + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + GOTO(out, rc = -ENOMEM); + + lustre_cfg_bufs_reset(bufs, lwp->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, NULL); + lcfg = lustre_cfg_new(LCFG_CLEANUP, bufs); + if (!lcfg) + GOTO(out, rc = -ENOMEM); + + /* Disconnect import first. NULL is passed for the '@env', since + * it will not be used. */ + rc = lwp->obd_lu_dev->ld_ops->ldo_process_config(NULL, lwp->obd_lu_dev, + lcfg); +out: + if (lcfg) + lustre_cfg_free(lcfg); + if (bufs) + OBD_FREE_PTR(bufs); + if (cfg) + OBD_FREE_PTR(cfg); + if (lwpname) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (logname) + OBD_FREE(logname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Stop the lwp for an OST/MDT target. + **/ +static int lustre_stop_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *lwp = NULL; + char *lwpname = NULL; + int rc = 0; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, NULL); + if (IS_ERR(lwp)) { + CDEBUG(PTR_ERR(lwp) == -ENOENT ? D_CONFIG : D_ERROR, + "%s: lwp wasn't started.\n", lsi->lsi_svname); + GOTO(out, rc = 0); + } + + lwp->obd_force = 1; + rc = class_manual_cleanup(lwp); + +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + RETURN(rc); +} + +/** + * Start the lwp(fsname-MDT0000-lwp-OSTxxxx) for an OST or MDT target, + * which would be used to establish connection from OST to MDT0. + **/ +static int lustre_start_lwp(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance *cfg = NULL; + struct obd_device *lwp; + char *lwpname = NULL; + char *logname = NULL; + int rc; + ENTRY; + + lwp = lustre_find_lwp(lsi, &lwpname, &logname); + + /* the lwp device already stared */ + if (lwp && !IS_ERR(lwp)) + GOTO(out, rc = 0); + + if (PTR_ERR(lwp) != -ENOENT) + GOTO(out, rc = PTR_ERR(lwp)); + + LASSERT(lwpname != NULL); + LASSERT(logname != NULL); + + OBD_ALLOC_PTR(cfg); + if (cfg == NULL) + GOTO(out, rc = -ENOMEM); + + cfg->cfg_callback = client_lwp_config_process; + cfg->cfg_instance = sb; + + rc = lustre_process_log(sb, logname, cfg); +out: + if (lwpname != NULL) + OBD_FREE(lwpname, MTI_NAME_MAXLEN); + if (logname != NULL) + OBD_FREE(logname, MTI_NAME_MAXLEN); + if (cfg != NULL) + OBD_FREE_PTR(cfg); + RETURN(rc); +} + +DEFINE_MUTEX(server_start_lock); + +/* Stop MDS/OSS if nobody is using them */ +static int server_stop_servers(int lsiflags) +{ + struct obd_device *obd = NULL; + struct obd_type *type = NULL; + int rc = 0; + ENTRY; + + mutex_lock(&server_start_lock); + + /* Either an MDT or an OST or neither */ + /* if this was an MDT, and there are no more MDT's, clean up the MDS */ + if (lsiflags & LDD_F_SV_TYPE_MDT) { + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_MDT_NAME); + } + + /* if this was an OST, and there are no more OST's, clean up the OSS */ + if (lsiflags & LDD_F_SV_TYPE_OST) { + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (obd != NULL) + type = class_search_type(LUSTRE_OST_NAME); + } + + if (obd != NULL && (type == NULL || type->typ_refcnt == 0)) { + int err; + + obd->obd_force = 1; + /* obd_fail doesn't mean much on a server obd */ + err = class_manual_cleanup(obd); + if (rc != 0) + rc = err; + } + + mutex_unlock(&server_start_lock); + + RETURN(rc); +} + +int server_mti_print(const char *title, struct mgs_target_info *mti) +{ + PRINT_CMD(PRINT_MASK, "mti %s\n", title); + PRINT_CMD(PRINT_MASK, "server: %s\n", mti->mti_svname); + PRINT_CMD(PRINT_MASK, "fs: %s\n", mti->mti_fsname); + PRINT_CMD(PRINT_MASK, "uuid: %s\n", mti->mti_uuid); + PRINT_CMD(PRINT_MASK, "ver: %d flags: %#x\n", + mti->mti_config_ver, mti->mti_flags); + return 0; +} +EXPORT_SYMBOL(server_mti_print); + +/* Generate data for registration */ +static int server_lsi2mti(struct lustre_sb_info *lsi, + struct mgs_target_info *mti) +{ + lnet_process_id_t id; + int rc, i = 0; + int cplen = 0; + ENTRY; + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + if (strlcpy(mti->mti_svname, lsi->lsi_svname, sizeof(mti->mti_svname)) + >= sizeof(mti->mti_svname)) + RETURN(-E2BIG); + + mti->mti_nid_count = 0; + while (LNetGetId(i++, &id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + + /* server use --servicenode param, only allow specified + * nids be registered */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) != 0 && + class_match_nid(lsi->lsi_lmd->lmd_params, + PARAM_FAILNODE, id.nid) < 1) + continue; + + /* match specified network */ + if (!class_match_net(lsi->lsi_lmd->lmd_params, + PARAM_NETWORK, LNET_NIDNET(id.nid))) + continue; + + mti->mti_nids[mti->mti_nid_count] = id.nid; + mti->mti_nid_count++; + if (mti->mti_nid_count >= MTI_NIDS_MAX) { + CWARN("Only using first %d nids for %s\n", + mti->mti_nid_count, mti->mti_svname); + break; + } + } + + mti->mti_lustre_ver = LUSTRE_VERSION_CODE; + mti->mti_config_ver = 0; + + rc = server_name2fsname(lsi->lsi_svname, mti->mti_fsname, NULL); + if (rc != 0) + return rc; + + rc = server_name2index(lsi->lsi_svname, &mti->mti_stripe_index, NULL); + if (rc < 0) + return rc; + /* Orion requires index to be set */ + LASSERT(!(rc & LDD_F_NEED_INDEX)); + /* keep only LDD flags */ + mti->mti_flags = lsi->lsi_flags & LDD_F_MASK; + if (mti->mti_flags & (LDD_F_WRITECONF | LDD_F_VIRGIN)) + mti->mti_flags |= LDD_F_UPDATE; + cplen = strlcpy(mti->mti_params, lsi->lsi_lmd->lmd_params, + sizeof(mti->mti_params)); + if (cplen >= sizeof(mti->mti_params)) + return -E2BIG; + return 0; +} + +/* Register an old or new target with the MGS. If needed MGS will construct + startup logs and assign index */ +static int server_register_target(struct lustre_sb_info *lsi) +{ + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + bool writeconf; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!IS_SERVER(lsi)) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + CDEBUG(D_MOUNT, "Registration %s, fs=%s, %s, index=%04x, flags=%#x\n", + mti->mti_svname, mti->mti_fsname, + libcfs_nid2str(mti->mti_nids[0]), mti->mti_stripe_index, + mti->mti_flags); + + /* if write_conf is true, the registration must succeed */ + writeconf = !!(lsi->lsi_flags & (LDD_F_NEED_INDEX | LDD_F_UPDATE)); + mti->mti_flags |= LDD_F_OPC_REG; + + /* Register the target */ + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + if (rc) { + if (mti->mti_flags & LDD_F_ERROR) { + LCONSOLE_ERROR_MSG(0x160, + "%s: the MGS refuses to allow this server " + "to start: rc = %d. Please see messages on " + "the MGS.\n", lsi->lsi_svname, rc); + } else if (writeconf) { + LCONSOLE_ERROR_MSG(0x15f, + "%s: cannot register this server with the MGS: " + "rc = %d. Is the MGS running?\n", + lsi->lsi_svname, rc); + } else { + CERROR("%s: error registering with the MGS: rc = %d " + "(not fatal)\n", lsi->lsi_svname, rc); + /* reset the error code for non-fatal error. */ + rc = 0; + } + GOTO(out, rc); + } + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); +} + +/** + * Notify the MGS that this target is ready. + * Used by IR - if the MGS receives this message, it will notify clients. + */ +static int server_notify_target(struct super_block *sb, struct obd_device *obd) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + struct mgs_target_info *mti = NULL; + int rc; + ENTRY; + + LASSERT(mgc); + + if (!(IS_SERVER(lsi))) + RETURN(-EINVAL); + + OBD_ALLOC_PTR(mti); + if (!mti) + RETURN(-ENOMEM); + rc = server_lsi2mti(lsi, mti); + if (rc) + GOTO(out, rc); + + mti->mti_instance = obd->u.obt.obt_instance; + mti->mti_flags |= LDD_F_OPC_READY; + + /* FIXME use mgc_process_config instead */ + rc = obd_set_info_async(NULL, mgc->u.cli.cl_mgc_mgsexp, + sizeof(KEY_REGISTER_TARGET), + KEY_REGISTER_TARGET, + sizeof(*mti), mti, NULL); + + /* Imperative recovery: if the mgs informs us to use IR? */ + if (!rc && !(mti->mti_flags & LDD_F_ERROR) && + (mti->mti_flags & LDD_F_IR_CAPABLE)) + lsi->lsi_flags |= LDD_F_IR_CAPABLE; + +out: + if (mti) + OBD_FREE_PTR(mti); + RETURN(rc); + +} + +/** Start server targets: MDTs and OSTs + */ +static int server_start_targets(struct super_block *sb, struct vfsmount *mnt) +{ + struct obd_device *obd; + struct lustre_sb_info *lsi = s2lsi(sb); + struct config_llog_instance cfg; + struct lu_env env; + struct lu_device *dev; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "starting target %s\n", lsi->lsi_svname); + + if (IS_MDT(lsi)) { + /* make sure the MDS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_MDS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_MDS_OBDNAME, + LUSTRE_MDS_NAME, + LUSTRE_MDS_OBDNAME"_uuid", + 0, 0, 0, 0); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start MDS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + /* If we're an OST, make sure the global OSS is running */ + if (IS_OST(lsi)) { + /* make sure OSS is started */ + mutex_lock(&server_start_lock); + obd = class_name2obd(LUSTRE_OSS_OBDNAME); + if (!obd) { + rc = lustre_start_simple(LUSTRE_OSS_OBDNAME, + LUSTRE_OSS_NAME, + LUSTRE_OSS_OBDNAME"_uuid", + 0, 0, 0, 0); + if (rc) { + mutex_unlock(&server_start_lock); + CERROR("failed to start OSS: %d\n", rc); + RETURN(rc); + } + } + mutex_unlock(&server_start_lock); + } + + /* Set the mgc fs to our server disk. This allows the MGC to + * read and write configs locally, in case it can't talk to the MGS. */ + if (lsi->lsi_srv_mnt) { + rc = server_mgc_set_fs(lsi->lsi_mgc, sb); + if (rc) + GOTO(out_stop_service, rc); + } + + /* Register with MGS */ + rc = server_register_target(lsi); + if (rc) + GOTO(out_mgc, rc); + + /* Let the target look up the mount using the target's name + (we can't pass the sb or mnt through class_process_config.) */ + rc = server_register_mount(lsi->lsi_svname, sb, mnt); + if (rc) + GOTO(out_mgc, rc); + + /* Start targets using the llog named for the target */ + memset(&cfg, 0, sizeof(cfg)); + cfg.cfg_callback = class_config_llog_handler; + rc = lustre_process_log(sb, lsi->lsi_svname, &cfg); + if (rc) { + CERROR("failed to start server %s: %d\n", + lsi->lsi_svname, rc); + /* Do NOT call server_deregister_mount() here. This makes it + * impossible to find mount later in cleanup time and leaves + * @lsi and othder stuff leaked. -umka */ + GOTO(out_mgc, rc); + } + + obd = class_name2obd(lsi->lsi_svname); + if (!obd) { + CERROR("no server named %s was started\n", lsi->lsi_svname); + GOTO(out_mgc, rc = -ENXIO); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + rc = lustre_start_lwp(sb); + if (rc) { + CERROR("%s: failed to start LWP: %d\n", + lsi->lsi_svname, rc); + GOTO(out_mgc, rc); + } + } + + server_notify_target(sb, obd); + + /* calculate recovery timeout, do it after lustre_process_log */ + server_calc_timeout(lsi, obd); + + /* log has been fully processed */ + obd_notify(obd, NULL, OBD_NOTIFY_CONFIG, (void *)CONFIG_LOG); + + /* log has been fully processed, let clients connect */ + dev = obd->obd_lu_dev; + if (dev && dev->ld_ops->ldo_prepare) { + rc = lu_env_init(&env, dev->ld_type->ldt_ctx_tags); + if (rc == 0) { + struct lu_context session_ctx; + + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + env.le_ses = &session_ctx; + + rc = dev->ld_ops->ldo_prepare(&env, NULL, dev); + + lu_env_fini(&env); + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + } + } + + /* abort recovery only on the complete stack: + * many devices can be involved */ + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_ABORT_RECOV) && + (OBP(obd, iocontrol))) { + obd_iocontrol(OBD_IOC_ABORT_RECOVERY, obd->obd_self_export, 0, + NULL, NULL); + } + +out_mgc: + /* Release the mgc fs for others to use */ + if (lsi->lsi_srv_mnt) + server_mgc_clear_fs(lsi->lsi_mgc); + +out_stop_service: + if (rc != 0) + server_stop_servers(lsi->lsi_flags); + + RETURN(rc); +} + +static int lsi_prepare(struct lustre_sb_info *lsi) +{ + __u32 index; + int rc; + ENTRY; + + LASSERT(lsi); + LASSERT(lsi->lsi_lmd); + + /* The server name is given as a mount line option */ + if (lsi->lsi_lmd->lmd_profile == NULL) { + LCONSOLE_ERROR("Can't determine server name\n"); + RETURN(-EINVAL); + } + + if (strlen(lsi->lsi_lmd->lmd_profile) >= sizeof(lsi->lsi_svname)) + RETURN(-ENAMETOOLONG); + + strcpy(lsi->lsi_svname, lsi->lsi_lmd->lmd_profile); + + /* Determine osd type */ + if (lsi->lsi_lmd->lmd_osd_type != NULL) { + if (strlen(lsi->lsi_lmd->lmd_osd_type) >= + sizeof(lsi->lsi_osd_type)) + RETURN(-ENAMETOOLONG); + + strcpy(lsi->lsi_osd_type, lsi->lsi_lmd->lmd_osd_type); + } else { + strcpy(lsi->lsi_osd_type, LUSTRE_OSD_LDISKFS_NAME); + } + + /* XXX: a temp. solution for components using fsfilt + * to be removed in one of the subsequent patches */ + if (!strcmp(lsi->lsi_lmd->lmd_osd_type, "osd-ldiskfs")) + strcpy(lsi->lsi_fstype, "ldiskfs"); + else + strcpy(lsi->lsi_fstype, lsi->lsi_lmd->lmd_osd_type); + + /* Determine server type */ + rc = server_name2index(lsi->lsi_svname, &index, NULL); + if (rc < 0) { + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) { + /* Assume we're a bare MGS */ + rc = 0; + lsi->lsi_lmd->lmd_flags |= LMD_FLG_NOSVC; + } else { + LCONSOLE_ERROR("Can't determine server type of '%s'\n", + lsi->lsi_svname); + RETURN(rc); + } + } + lsi->lsi_flags |= rc; + + /* Add mount line flags that used to be in ldd: + * writeconf, mgs, anything else? + */ + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_WRITECONF) ? + LDD_F_WRITECONF : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_VIRGIN) ? + LDD_F_VIRGIN : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_UPDATE) ? + LDD_F_UPDATE : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_MGS) ? + LDD_F_SV_TYPE_MGS : 0; + lsi->lsi_flags |= (lsi->lsi_lmd->lmd_flags & LMD_FLG_NO_PRIMNODE) ? + LDD_F_NO_PRIMNODE : 0; + + RETURN(0); +} + +/*************** server mount ******************/ + +/** Start the shutdown of servers at umount. + */ +static void server_put_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *tmpname, *extraname = NULL; + int tmpname_sz; + int lsiflags = lsi->lsi_flags; + ENTRY; + + LASSERT(IS_SERVER(lsi)); + + tmpname_sz = strlen(lsi->lsi_svname) + 1; + OBD_ALLOC(tmpname, tmpname_sz); + memcpy(tmpname, lsi->lsi_svname, tmpname_sz); + CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); + if (IS_MDT(lsi) && (lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC)) + snprintf(tmpname, tmpname_sz, "MGS"); + + /* disconnect the lwp first to drain off the inflight request */ + if (IS_OST(lsi) || IS_MDT(lsi)) { + int rc; + + rc = lustre_disconnect_lwp(sb); + if (rc && rc != ETIMEDOUT) + CERROR("%s: failed to disconnect lwp. (rc=%d)\n", + tmpname, rc); + } + + /* Stop the target */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi) || IS_OST(lsi))) { + struct lustre_profile *lprof = NULL; + + /* tell the mgc to drop the config log */ + lustre_end_log(sb, lsi->lsi_svname, NULL); + + /* COMPAT_146 - profile may get deleted in mgc_cleanup. + If there are any setup/cleanup errors, save the lov + name for safety cleanup later. */ + lprof = class_get_profile(lsi->lsi_svname); + if (lprof && lprof->lp_dt) { + OBD_ALLOC(extraname, strlen(lprof->lp_dt) + 1); + strcpy(extraname, lprof->lp_dt); + } + + obd = class_name2obd(lsi->lsi_svname); + if (obd) { + CDEBUG(D_MOUNT, "stopping %s\n", obd->obd_name); + if (lsiflags & LSI_UMOUNT_FAILOVER) + obd->obd_fail = 1; + /* We can't seem to give an error return code + * to .put_super, so we better make sure we clean up! */ + obd->obd_force = 1; + class_manual_cleanup(obd); + } else { + CERROR("no obd %s\n", lsi->lsi_svname); + server_deregister_mount(lsi->lsi_svname); + } + } + + /* If they wanted the mgs to stop separately from the mdt, they + should have put it on a different device. */ + if (IS_MGS(lsi)) { + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); + } + + if (IS_OST(lsi) || IS_MDT(lsi)) { + if (lustre_stop_lwp(sb) < 0) + CERROR("%s: failed to stop lwp!\n", tmpname); + } + + /* Clean the mgc and sb */ + lustre_common_put_super(sb); + + /* wait till all in-progress cleanups are done + * specifically we're interested in ofd cleanup + * as it pins OSS */ + obd_zombie_barrier(); + + /* Stop the servers (MDS, OSS) if no longer needed. We must wait + until the target is really gone so that our type refcount check + is right. */ + server_stop_servers(lsiflags); + + /* In case of startup or cleanup err, stop related obds */ + if (extraname) { + obd = class_name2obd(extraname); + if (obd) { + CWARN("Cleaning orphaned obd %s\n", extraname); + obd->obd_force = 1; + class_manual_cleanup(obd); + } + OBD_FREE(extraname, strlen(extraname) + 1); + } + + LCONSOLE_WARN("server umount %s complete\n", tmpname); + OBD_FREE(tmpname, tmpname_sz); + EXIT; +} + +/** Called only for 'umount -f' + */ +static void server_umount_begin(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + CDEBUG(D_MOUNT, "umount -f\n"); + /* umount = failover + umount -f = force + no third way to do non-force, non-failover */ + lsi->lsi_flags &= ~LSI_UMOUNT_FAILOVER; + EXIT; +} + +static int server_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_statfs statfs; + int rc; + ENTRY; + + if (lsi->lsi_dt_dev) { + rc = dt_statfs(NULL, lsi->lsi_dt_dev, &statfs); + if (rc == 0) { + statfs_unpack(buf, &statfs); + buf->f_type = sb->s_magic; + RETURN(0); + } + } + + /* just return 0 */ + buf->f_type = sb->s_magic; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = 1; + buf->f_bfree = 0; + buf->f_bavail = 0; + buf->f_files = 1; + buf->f_ffree = 0; + buf->f_namelen = NAME_MAX; + RETURN(0); +} + +/** The operations we support directly on the superblock: + * mount, umount, and df. + */ +static struct super_operations server_ops = { + .put_super = server_put_super, + .umount_begin = server_umount_begin, /* umount -f */ + .statfs = server_statfs, +}; + +#define log2(n) ffz(~(n)) +#define LUSTRE_SUPER_MAGIC 0x0BD00BD1 + +static int server_fill_super_common(struct super_block *sb) +{ + struct inode *root = 0; + ENTRY; + + CDEBUG(D_MOUNT, "Server sb, dev=%d\n", (int)sb->s_dev); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = log2(sb->s_blocksize); + sb->s_magic = LUSTRE_SUPER_MAGIC; + sb->s_maxbytes = 0; /* we don't allow file IO on server mountpoints */ + sb->s_flags |= MS_RDONLY; + sb->s_op = &server_ops; + + root = new_inode(sb); + if (!root) { + CERROR("Can't make root inode\n"); + RETURN(-EIO); + } + + /* returns -EIO for every operation */ + /* make_bad_inode(root); -- badness - can't umount */ + /* apparently we need to be a directory for the mount to finish */ + root->i_mode = S_IFDIR; + + sb->s_root = d_make_root(root); + if (!sb->s_root) { + CERROR("%s: can't make root dentry\n", sb->s_id); + RETURN(-EIO); + } + + RETURN(0); +} + +static int osd_start(struct lustre_sb_info *lsi, unsigned long mflags) +{ + struct lustre_mount_data *lmd = lsi->lsi_lmd; + struct obd_device *obd; + struct dt_device_param p; + char flagstr[16]; + int rc; + ENTRY; + + CDEBUG(D_MOUNT, + "Attempting to start %s, type=%s, lsifl=%x, mountfl=%lx\n", + lsi->lsi_svname, lsi->lsi_osd_type, lsi->lsi_flags, mflags); + + sprintf(lsi->lsi_osd_obdname, "%s-osd", lsi->lsi_svname); + strcpy(lsi->lsi_osd_uuid, lsi->lsi_osd_obdname); + strcat(lsi->lsi_osd_uuid, "_UUID"); + sprintf(flagstr, "%lu:%lu", mflags, (unsigned long) lmd->lmd_flags); + + obd = class_name2obd(lsi->lsi_osd_obdname); + if (obd == NULL) { + rc = lustre_start_simple(lsi->lsi_osd_obdname, + lsi->lsi_osd_type, + lsi->lsi_osd_uuid, lmd->lmd_dev, + flagstr, lsi->lsi_lmd->lmd_opts, + lsi->lsi_svname); + if (rc) + GOTO(out, rc); + obd = class_name2obd(lsi->lsi_osd_obdname); + LASSERT(obd); + } + + rc = obd_connect(NULL, &lsi->lsi_osd_exp, + obd, &obd->obd_uuid, NULL, NULL); + if (rc) { + obd->obd_force = 1; + class_manual_cleanup(obd); + lsi->lsi_dt_dev = NULL; + } + + /* XXX: to keep support old components relying on lsi_srv_mnt + * we get this info from OSD just started */ + LASSERT(obd->obd_lu_dev); + lsi->lsi_dt_dev = lu2dt_dev(obd->obd_lu_dev); + LASSERT(lsi->lsi_dt_dev); + + dt_conf_get(NULL, lsi->lsi_dt_dev, &p); + + lsi->lsi_srv_mnt = p.ddp_mnt; + +out: + RETURN(rc); +} + +/** Fill in the superblock info for a Lustre server. + * Mount the device with the correct options. + * Read the on-disk config file. + * Start the services. + */ +int server_fill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + int rc; + ENTRY; + + rc = lsi_prepare(lsi); + if (rc) + RETURN(rc); + + /* Start low level OSD */ + rc = osd_start(lsi, sb->s_flags); + if (rc) { + CERROR("Unable to start osd on %s: %d\n", + lsi->lsi_lmd->lmd_dev, rc); + lustre_put_lsi(sb); + RETURN(rc); + } + + CDEBUG(D_MOUNT, "Found service %s on device %s\n", + lsi->lsi_svname, lsi->lsi_lmd->lmd_dev); + + if (class_name2obd(lsi->lsi_svname)) { + LCONSOLE_ERROR_MSG(0x161, "The target named %s is already " + "running. Double-mount may have compromised" + " the disk journal.\n", + lsi->lsi_svname); + lustre_put_lsi(sb); + RETURN(-EALREADY); + } + + /* Start MGS before MGC */ + if (IS_MGS(lsi) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + rc = server_start_mgs(sb); + if (rc) + GOTO(out_mnt, rc); + } + + /* Start MGC before servers */ + rc = lustre_start_mgc(sb); + if (rc) + GOTO(out_mnt, rc); + + /* Set up all obd devices for service */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_OST(lsi) || IS_MDT(lsi))) { + rc = server_start_targets(sb, lsi->lsi_srv_mnt); + if (rc < 0) { + CERROR("Unable to start targets: %d\n", rc); + GOTO(out_mnt, rc); + } + /* FIXME overmount client here, or can we just start a + * client log and client_fill_super on this sb? We + * need to make sure server_put_super gets called too + * - ll_put_super calls lustre_common_put_super; check + * there for LSI_SERVER flag, call s_p_s if so. + * + * Probably should start client from new thread so we + * can return. Client will not finish until all + * servers are connected. Note - MGS-only server does + * NOT get a client, since there is no lustre fs + * associated - the MGS is for all lustre fs's */ + } + + rc = server_fill_super_common(sb); + if (rc) + GOTO(out_mnt, rc); + + RETURN(0); +out_mnt: + /* We jump here in case of failure while starting targets or MGS. + * In this case we can't just put @mnt and have to do real cleanup + * with stoping targets, etc. */ + server_put_super(sb); + return rc; +} + +/* + * Calculate timeout value for a target. + */ +void server_calc_timeout(struct lustre_sb_info *lsi, struct obd_device *obd) +{ + struct lustre_mount_data *lmd; + int soft = 0; + int hard = 0; + int factor = 0; + bool has_ir = !!(lsi->lsi_flags & LDD_F_IR_CAPABLE); + int min = OBD_RECOVERY_TIME_MIN; + + LASSERT(IS_SERVER(lsi)); + + lmd = lsi->lsi_lmd; + if (lmd) { + soft = lmd->lmd_recovery_time_soft; + hard = lmd->lmd_recovery_time_hard; + has_ir = has_ir && !(lmd->lmd_flags & LMD_FLG_NOIR); + obd->obd_no_ir = !has_ir; + } + + if (soft == 0) + soft = OBD_RECOVERY_TIME_SOFT; + if (hard == 0) + hard = OBD_RECOVERY_TIME_HARD; + + /* target may have ir_factor configured. */ + factor = OBD_IR_FACTOR_DEFAULT; + if (obd->obd_recovery_ir_factor) + factor = obd->obd_recovery_ir_factor; + + if (has_ir) { + int new_soft = soft; + int new_hard = hard; + + /* adjust timeout value by imperative recovery */ + + new_soft = (soft * factor) / OBD_IR_FACTOR_MAX; + new_hard = (hard * factor) / OBD_IR_FACTOR_MAX; + + /* make sure the timeout is not too short */ + new_soft = max(min, new_soft); + new_hard = max(new_soft, new_hard); + + LCONSOLE_INFO("%s: Imperative Recovery enabled, recovery " + "window shrunk from %d-%d down to %d-%d\n", + obd->obd_name, soft, hard, new_soft, new_hard); + + soft = new_soft; + hard = new_hard; + } + + /* we're done */ + obd->obd_recovery_timeout = max(obd->obd_recovery_timeout, soft); + obd->obd_recovery_time_hard = hard; + obd->obd_recovery_ir_factor = factor; +} +EXPORT_SYMBOL(server_calc_timeout); diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c new file mode 100644 index 000000000000..01a0e1f83a68 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -0,0 +1,362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks). */ +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) +{ + obd_flag newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = ll_inode_blksize(src); + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = src->i_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = src->i_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = ll_inode_flags(src); + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + if (valid & OBD_MD_FLCOOKIE) + dst->o_lcookie = src->o_lcookie; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +/* returns FALSE if comparison (by flags) is same, TRUE if changed */ +int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare) +{ + int res = 0; + + if ( compare & OBD_MD_FLATIME ) + res = (res || (dst->o_atime != src->o_atime)); + if ( compare & OBD_MD_FLMTIME ) + res = (res || (dst->o_mtime != src->o_mtime)); + if ( compare & OBD_MD_FLCTIME ) + res = (res || (dst->o_ctime != src->o_ctime)); + if ( compare & OBD_MD_FLSIZE ) + res = (res || (dst->o_size != src->o_size)); + if ( compare & OBD_MD_FLBLOCKS ) /* allocation of space */ + res = (res || (dst->o_blocks != src->o_blocks)); + if ( compare & OBD_MD_FLBLKSZ ) + res = (res || (dst->o_blksize != src->o_blksize)); + if ( compare & OBD_MD_FLTYPE ) + res = (res || (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0)); + if ( compare & OBD_MD_FLMODE ) + res = (res || (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0)); + if ( compare & OBD_MD_FLUID ) + res = (res || (dst->o_uid != src->o_uid)); + if ( compare & OBD_MD_FLGID ) + res = (res || (dst->o_gid != src->o_gid)); + if ( compare & OBD_MD_FLFLAGS ) + res = (res || (dst->o_flags != src->o_flags)); + if ( compare & OBD_MD_FLNLINK ) + res = (res || (dst->o_nlink != src->o_nlink)); + if ( compare & OBD_MD_FLFID ) { + res = (res || (dst->o_parent_seq != src->o_parent_seq)); + res = (res || (dst->o_parent_ver != src->o_parent_ver)); + } + if ( compare & OBD_MD_FLGENER ) + res = (res || (dst->o_parent_oid != src->o_parent_oid)); + /* XXX Don't know if thses should be included here - wasn't previously + if ( compare & OBD_MD_FLINLINE ) + res = (res || memcmp(dst->o_inline, src->o_inline)); + */ + return res; +} +EXPORT_SYMBOL(obdo_cmp_md); + +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid) +{ + if (ia_valid & ATTR_ATIME) { + oa->o_atime = LTIME_S(attr->ia_atime); + oa->o_valid |= OBD_MD_FLATIME; + } + if (ia_valid & ATTR_MTIME) { + oa->o_mtime = LTIME_S(attr->ia_mtime); + oa->o_valid |= OBD_MD_FLMTIME; + } + if (ia_valid & ATTR_CTIME) { + oa->o_ctime = LTIME_S(attr->ia_ctime); + oa->o_valid |= OBD_MD_FLCTIME; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = attr->ia_size; + oa->o_valid |= OBD_MD_FLSIZE; + } + if (ia_valid & ATTR_MODE) { + oa->o_mode = attr->ia_mode; + oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + oa->o_mode &= ~S_ISGID; + } + if (ia_valid & ATTR_UID) { + oa->o_uid = attr->ia_uid; + oa->o_valid |= OBD_MD_FLUID; + } + if (ia_valid & ATTR_GID) { + oa->o_gid = attr->ia_gid; + oa->o_valid |= OBD_MD_FLGID; + } +} +EXPORT_SYMBOL(obdo_from_iattr); + +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid) +{ + valid &= oa->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n", + oa->o_valid, oa->o_mtime, oa->o_ctime); + + attr->ia_valid = 0; + if (valid & OBD_MD_FLATIME) { + LTIME_S(attr->ia_atime) = oa->o_atime; + attr->ia_valid |= ATTR_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + LTIME_S(attr->ia_mtime) = oa->o_mtime; + attr->ia_valid |= ATTR_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + LTIME_S(attr->ia_ctime) = oa->o_ctime; + attr->ia_valid |= ATTR_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + attr->ia_size = oa->o_size; + attr->ia_valid |= ATTR_SIZE; + } +#if 0 /* you shouldn't be able to change a file's type with setattr */ + if (valid & OBD_MD_FLTYPE) { + attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_valid |= ATTR_MODE; + } +#endif + if (valid & OBD_MD_FLMODE) { + attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_valid |= ATTR_MODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + if (valid & OBD_MD_FLUID) { + attr->ia_uid = oa->o_uid; + attr->ia_valid |= ATTR_UID; + } + if (valid & OBD_MD_FLGID) { + attr->ia_gid = oa->o_gid; + attr->ia_valid |= ATTR_GID; + } +} +EXPORT_SYMBOL(iattr_from_obdo); + +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid) +{ + iattr_from_obdo(&op_data->op_attr, oa, valid); + if (valid & OBD_MD_FLBLOCKS) { + op_data->op_attr_blocks = oa->o_blocks; + op_data->op_attr.ia_valid |= ATTR_BLOCKS; + } + if (valid & OBD_MD_FLFLAGS) { + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + oa->o_flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + } +} +EXPORT_SYMBOL(md_from_obdo); + +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid) +{ + obdo_from_iattr(oa, &op_data->op_attr, valid); + if (valid & ATTR_BLOCKS) { + oa->o_blocks = op_data->op_attr_blocks; + oa->o_valid |= OBD_MD_FLBLOCKS; + } + if (valid & ATTR_ATTR_FLAG) { + oa->o_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } +} +EXPORT_SYMBOL(obdo_from_md); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = cpu_to_le64(sobdo->o_size); + dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime); + dobdo->o_atime = cpu_to_le64(sobdo->o_atime); + dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime); + dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks); + dobdo->o_mode = cpu_to_le32(sobdo->o_mode); + dobdo->o_uid = cpu_to_le32(sobdo->o_uid); + dobdo->o_gid = cpu_to_le32(sobdo->o_gid); + dobdo->o_flags = cpu_to_le32(sobdo->o_flags); + dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink); + dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); + dobdo->o_valid = cpu_to_le64(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_cpu_to_le); + +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = le64_to_cpu(sobdo->o_size); + dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime); + dobdo->o_atime = le64_to_cpu(sobdo->o_atime); + dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime); + dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks); + dobdo->o_mode = le32_to_cpu(sobdo->o_mode); + dobdo->o_uid = le32_to_cpu(sobdo->o_uid); + dobdo->o_gid = le32_to_cpu(sobdo->o_gid); + dobdo->o_flags = le32_to_cpu(sobdo->o_flags); + dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink); + dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); + dobdo->o_valid = le64_to_cpu(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_le_to_cpu); diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c new file mode 100644 index 000000000000..c3b7a78dba50 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include + +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c new file mode 100644 index 000000000000..af5f27f82bc5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/uuid.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include + +#include +#include + + +static inline __u32 consume(int nob, __u8 **ptr) +{ + __u32 value; + + LASSERT(nob <= sizeof value); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof *uu == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile new file mode 100644 index 000000000000..4c48e2432f9b --- /dev/null +++ b/drivers/staging/lustre/lustre/obdecho/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_LUSTRE_FS) += obdecho.o +obdecho-y := echo_client.o lproc_echo.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/obdecho/echo.c b/drivers/staging/lustre/lustre/obdecho/echo.c new file mode 100644 index 000000000000..9e64939af9dc --- /dev/null +++ b/drivers/staging/lustre/lustre/obdecho/echo.c @@ -0,0 +1,679 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo.c + * + * Author: Peter Braam + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include +#include +#include +#include + +#include "echo_internal.h" + +/* The echo objid needs to be below 2^32, because regular FID numbers are + * limited to 2^32 objects in f_oid for the FID_SEQ_ECHO range. b=23335 */ +#define ECHO_INIT_OID 0x10000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_PERSISTENT_PAGES (ECHO_PERSISTENT_SIZE >> PAGE_CACHE_SHIFT) +static struct page *echo_persistent_pages[ECHO_PERSISTENT_PAGES]; + +enum { + LPROC_ECHO_READ_BYTES = 1, + LPROC_ECHO_WRITE_BYTES = 2, + LPROC_ECHO_LAST = LPROC_ECHO_WRITE_BYTES +1 +}; + +static int echo_connect(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *data, + void *localdata) +{ + struct lustre_handle conn = { 0 }; + int rc; + + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; +} + +static int echo_disconnect(struct obd_export *exp) +{ + LASSERT (exp != NULL); + + return server_disconnect_export(exp); +} + +static int echo_init_export(struct obd_export *exp) +{ + return ldlm_init_export(exp); +} + +static int echo_destroy_export(struct obd_export *exp) +{ + ENTRY; + + target_destroy_export(exp); + ldlm_destroy_export(exp); + + RETURN(0); +} + + static __u64 echo_next_id(struct obd_device *obddev) +{ + obd_id id; + + spin_lock(&obddev->u.echo.eo_lock); + id = ++obddev->u.echo.eo_lastino; + spin_unlock(&obddev->u.echo.eo_lock); + + return id; +} + +static int echo_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + struct obd_device *obd = class_exp2obd(exp); + + if (!obd) { + CERROR("invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + return -EINVAL; + } + + if (!(oa->o_mode && S_IFMT)) { + CERROR("echo obd: no type!\n"); + return -ENOENT; + } + + if (!(oa->o_valid & OBD_MD_FLTYPE)) { + CERROR("invalid o_valid "LPX64"\n", oa->o_valid); + return -EINVAL; + } + + ostid_set_seq_echo(&oa->o_oi); + ostid_set_id(&oa->o_oi, echo_next_id(obd)); + oa->o_valid = OBD_MD_FLID; + + return 0; +} + +static int echo_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa) +{ + struct obd_device *obd = class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: "LPX64"\n", oa->o_valid); + RETURN(-EINVAL); + } + + if (ostid_id(&oa->o_oi) > obd->u.echo.eo_lastino || + ostid_id(&oa->o_oi) < ECHO_INIT_OID) { + CERROR("bad destroy objid: "DOSTID"\n", POSTID(&oa->o_oi)); + RETURN(-EINVAL); + } + + RETURN(0); +} + +static int echo_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + struct obd_device *obd = class_exp2obd(exp); + obd_id id = ostid_id(&oinfo->oi_oa->o_oi); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: "LPX64"\n", + oinfo->oi_oa->o_valid); + RETURN(-EINVAL); + } + + obdo_cpy_md(oinfo->oi_oa, &obd->u.echo.eo_oa, oinfo->oi_oa->o_valid); + ostid_set_seq_echo(&oinfo->oi_oa->o_oi); + ostid_set_id(&oinfo->oi_oa->o_oi, id); + + RETURN(0); +} + +static int echo_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) +{ + struct obd_device *obd = class_exp2obd(exp); + + ENTRY; + if (!obd) { + CERROR("invalid client cookie "LPX64"\n", + exp->exp_handle.h_cookie); + RETURN(-EINVAL); + } + + if (!(oinfo->oi_oa->o_valid & OBD_MD_FLID)) { + CERROR("obdo missing FLID valid flag: "LPX64"\n", + oinfo->oi_oa->o_valid); + RETURN(-EINVAL); + } + + memcpy(&obd->u.echo.eo_oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa)); + + if (ostid_id(&oinfo->oi_oa->o_oi) & 4) { + /* Save lock to force ACKed reply */ + ldlm_lock_addref (&obd->u.echo.eo_nl_lock, LCK_NL); + oti->oti_ack_locks[0].mode = LCK_NL; + oti->oti_ack_locks[0].lock = obd->u.echo.eo_nl_lock; + } + + RETURN(0); +} + +static void +echo_page_debug_setup(struct page *page, int rw, obd_id id, + __u64 offset, int len) +{ + int page_offset = offset & ~CFS_PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + if (rw & OBD_BRW_READ) + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + else + block_debug_setup(addr, OBD_ECHO_BLOCK_SIZE, + 0xecc0ecc0ecc0ecc0ULL, + 0xecc0ecc0ecc0ecc0ULL); + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); +} + +static int +echo_page_debug_check(struct page *page, obd_id id, + __u64 offset, int len) +{ + int page_offset = offset & ~CFS_PAGE_MASK; + char *addr = ((char *)kmap(page)) + page_offset; + int rc = 0; + int rc2; + + if (len % OBD_ECHO_BLOCK_SIZE != 0) + CERROR("Unexpected block size %d\n", len); + + while (len > 0) { + rc2 = block_debug_check("echo", addr, OBD_ECHO_BLOCK_SIZE, + offset, id); + + if (rc2 != 0 && rc == 0) + rc = rc2; + + addr += OBD_ECHO_BLOCK_SIZE; + offset += OBD_ECHO_BLOCK_SIZE; + len -= OBD_ECHO_BLOCK_SIZE; + } + + kunmap(page); + + return (rc); +} + +/* This allows us to verify that desc_private is passed unmolested */ +#define DESC_PRIV 0x10293847 + +static int echo_map_nb_to_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *lb, int cmd, int *left) +{ + int gfp_mask = (ostid_id(&obj->ioo_oid) & 1) ? + GFP_HIGHUSER : GFP_IOFS; + int ispersistent = ostid_id(&obj->ioo_oid) == ECHO_PERSISTENT_OBJID; + int debug_setup = (!ispersistent && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + struct niobuf_local *res = lb; + obd_off offset = nb->offset; + int len = nb->len; + + while (len > 0) { + int plen = PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1)); + if (len < plen) + plen = len; + + /* check for local buf overflow */ + if (*left == 0) + return -EINVAL; + + res->lnb_file_offset = offset; + res->len = plen; + LASSERT((res->lnb_file_offset & ~CFS_PAGE_MASK) + res->len <= + PAGE_CACHE_SIZE); + + if (ispersistent && + ((res->lnb_file_offset >> PAGE_CACHE_SHIFT) < + ECHO_PERSISTENT_PAGES)) { + res->page = + echo_persistent_pages[res->lnb_file_offset >> + PAGE_CACHE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page (res->page); + } else { + OBD_PAGE_ALLOC(res->page, gfp_mask); + if (res->page == NULL) { + CERROR("can't get page for id " DOSTID"\n", + POSTID(&obj->ioo_oid)); + return -ENOMEM; + } + } + + CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n", + res->page, res->lnb_file_offset, res->len); + + if (cmd & OBD_BRW_READ) + res->rc = res->len; + + if (debug_setup) + echo_page_debug_setup(res->page, cmd, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, res->len); + + offset += plen; + len -= plen; + res++; + + (*left)--; + (*pages)++; + } + + return 0; +} + +static int echo_finalize_lb(struct obdo *oa, struct obd_ioobj *obj, + struct niobuf_remote *rb, int *pgs, + struct niobuf_local *lb, int verify) +{ + struct niobuf_local *res = lb; + obd_off start = rb->offset >> PAGE_CACHE_SHIFT; + obd_off end = (rb->offset + rb->len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int count = (int)(end - start); + int rc = 0; + int i; + + for (i = 0; i < count; i++, (*pgs) ++, res++) { + struct page *page = res->page; + void *addr; + + if (page == NULL) { + CERROR("null page objid "LPU64":%p, buf %d/%d\n", + ostid_id(&obj->ioo_oid), page, i, + obj->ioo_bufcnt); + return -EFAULT; + } + + addr = kmap(page); + + CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n", + res->page, addr, res->lnb_file_offset); + + if (verify) { + int vrc = echo_page_debug_check(page, + ostid_id(&obj->ioo_oid), + res->lnb_file_offset, + res->len); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + + kunmap(page); + /* NB see comment above regarding persistent pages */ + OBD_PAGE_FREE(page); + } + + return rc; +} + +static int echo_preprw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *nb, int *pages, + struct niobuf_local *res, struct obd_trans_info *oti, + struct lustre_capa *unused) +{ + struct obd_device *obd; + int tot_bytes = 0; + int rc = 0; + int i, left; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + /* Temp fix to stop falling foul of osc_announce_cached() */ + oa->o_valid &= ~(OBD_MD_FLBLOCKS | OBD_MD_FLGRANT); + + memset(res, 0, sizeof(*res) * *pages); + + CDEBUG(D_PAGE, "%s %d obdos with %d IOs\n", + cmd == OBD_BRW_READ ? "reading" : "writing", objcount, *pages); + + if (oti) + oti->oti_handle = (void *)DESC_PRIV; + + left = *pages; + *pages = 0; + + for (i = 0; i < objcount; i++, obj++) { + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++) { + + rc = echo_map_nb_to_lb(oa, obj, nb, pages, + res + *pages, cmd, &left); + if (rc) + GOTO(preprw_cleanup, rc); + + tot_bytes += nb->len; + } + } + + atomic_add(*pages, &obd->u.echo.eo_prep); + + if (cmd & OBD_BRW_READ) + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_READ_BYTES, + tot_bytes); + else + lprocfs_counter_add(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + tot_bytes); + + CDEBUG(D_PAGE, "%d pages allocated after prep\n", + atomic_read(&obd->u.echo.eo_prep)); + + RETURN(0); + +preprw_cleanup: + /* It is possible that we would rather handle errors by allow + * any already-set-up pages to complete, rather than tearing them + * all down again. I believe that this is what the in-kernel + * prep/commit operations do. + */ + CERROR("cleaning up %u pages (%d obdos)\n", *pages, objcount); + for (i = 0; i < *pages; i++) { + kunmap(res[i].page); + /* NB if this is a persistent page, __free_pages will just + * lose the extra ref gained above */ + OBD_PAGE_FREE(res[i].page); + res[i].page = NULL; + atomic_dec(&obd->u.echo.eo_prep); + } + + return rc; +} + +static int echo_commitrw(const struct lu_env *env, int cmd, + struct obd_export *export, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rb, int niocount, + struct niobuf_local *res, struct obd_trans_info *oti, + int rc) +{ + struct obd_device *obd; + int pgs = 0; + int i; + ENTRY; + + obd = export->exp_obd; + if (obd == NULL) + RETURN(-EINVAL); + + if (rc) + GOTO(commitrw_cleanup, rc); + + if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { + CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", + objcount, niocount); + } else { + CDEBUG(D_PAGE, "writing %d obdos with %d IOs\n", + objcount, niocount); + } + + if (niocount && res == NULL) { + CERROR("NULL res niobuf with niocount %d\n", niocount); + RETURN(-EINVAL); + } + + LASSERT(oti == NULL || oti->oti_handle == (void *)DESC_PRIV); + + for (i = 0; i < objcount; i++, obj++) { + int verify = (rc == 0 && + ostid_id(&obj->ioo_oid) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + int j; + + for (j = 0 ; j < obj->ioo_bufcnt ; j++, rb++) { + int vrc = echo_finalize_lb(oa, obj, rb, &pgs, &res[pgs], + verify); + if (vrc == 0) + continue; + + if (vrc == -EFAULT) + GOTO(commitrw_cleanup, rc = vrc); + + if (rc == 0) + rc = vrc; + } + + } + + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CDEBUG(D_PAGE, "%d pages remain after commit\n", + atomic_read(&obd->u.echo.eo_prep)); + RETURN(rc); + +commitrw_cleanup: + atomic_sub(pgs, &obd->u.echo.eo_prep); + + CERROR("cleaning up %d pages (%d obdos)\n", + niocount - pgs - 1, objcount); + + while (pgs < niocount) { + struct page *page = res[pgs++].page; + + if (page == NULL) + continue; + + /* NB see comment above regarding persistent pages */ + OBD_PAGE_FREE(page); + atomic_dec(&obd->u.echo.eo_prep); + } + return rc; +} + +static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars; + int rc; + __u64 lock_flags = 0; + struct ldlm_res_id res_id = {.name = {1}}; + char ns_name[48]; + ENTRY; + + obd->u.echo.eo_obt.obt_magic = OBT_MAGIC; + spin_lock_init(&obd->u.echo.eo_lock); + obd->u.echo.eo_lastino = ECHO_INIT_OID; + + sprintf(ns_name, "echotgt-%s", obd->obd_uuid.uuid); + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, + LDLM_NAMESPACE_SERVER, + LDLM_NAMESPACE_MODEST, + LDLM_NS_TYPE_OST); + if (obd->obd_namespace == NULL) { + LBUG(); + RETURN(-ENOMEM); + } + + rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, + NULL, LCK_NL, &lock_flags, NULL, + ldlm_completion_ast, NULL, NULL, 0, + LVB_T_NONE, NULL, &obd->u.echo.eo_nl_lock); + LASSERT (rc == ELDLM_OK); + + lprocfs_echo_init_vars(&lvars); + if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0 && + lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "read_bytes", "bytes"); + lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, + LPROCFS_CNTR_AVGMINMAX, + "write_bytes", "bytes"); + } + + ptlrpc_init_client (LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obd->obd_ldlm_client); + RETURN(0); +} + +static int echo_cleanup(struct obd_device *obd) +{ + int leaked; + ENTRY; + + lprocfs_obd_cleanup(obd); + lprocfs_free_obd_stats(obd); + + ldlm_lock_decref(&obd->u.echo.eo_nl_lock, LCK_NL); + + /* XXX Bug 3413; wait for a bit to ensure the BL callback has + * happened before calling ldlm_namespace_free() */ + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, cfs_time_seconds(1)); + + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; + + leaked = atomic_read(&obd->u.echo.eo_prep); + if (leaked != 0) + CERROR("%d prep/commitrw pages leaked\n", leaked); + + RETURN(0); +} + +struct obd_ops echo_obd_ops = { + .o_owner = THIS_MODULE, + .o_connect = echo_connect, + .o_disconnect = echo_disconnect, + .o_init_export = echo_init_export, + .o_destroy_export = echo_destroy_export, + .o_create = echo_create, + .o_destroy = echo_destroy, + .o_getattr = echo_getattr, + .o_setattr = echo_setattr, + .o_preprw = echo_preprw, + .o_commitrw = echo_commitrw, + .o_setup = echo_setup, + .o_cleanup = echo_cleanup +}; + +void echo_persistent_pages_fini(void) +{ + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) + if (echo_persistent_pages[i] != NULL) { + OBD_PAGE_FREE(echo_persistent_pages[i]); + echo_persistent_pages[i] = NULL; + } +} + +int echo_persistent_pages_init(void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_PERSISTENT_PAGES; i++) { + int gfp_mask = (i < ECHO_PERSISTENT_PAGES/2) ? + GFP_IOFS : GFP_HIGHUSER; + + OBD_PAGE_ALLOC(pg, gfp_mask); + if (pg == NULL) { + echo_persistent_pages_fini (); + return (-ENOMEM); + } + + memset (kmap (pg), 0, PAGE_CACHE_SIZE); + kunmap (pg); + + echo_persistent_pages[i] = pg; + } + + return (0); +} diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c new file mode 100644 index 000000000000..0545d1666841 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c @@ -0,0 +1,3217 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_ECHO +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "echo_internal.h" + +/** \defgroup echo_client Echo Client + * @{ + */ + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct cl_site *ed_site; + struct lu_device *ed_next; + int ed_next_islov; + int ed_next_ismd; + struct lu_client_seq *ed_cl_seq; +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_stripe_md *eo_lsm; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_stripe_md **eoc_md; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct mutex ep_lock; + struct page *ep_vmpage; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; + atomic_t el_refcount; +}; + +struct echo_io { + struct cl_io_slice ei_cl; +}; + +#if 0 +struct echo_req { + struct cl_req_slice er_cl; +}; +#endif + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers Helper functions + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) +{ + return container_of0(dev, struct echo_device, ed_cl); +} + +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} + +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); +} + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +/** @} echo_helpers */ + +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsm); +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_enqueue (struct echo_object *eco, obd_off start, + obd_off end, int mode, __u64 *cookie); +static int cl_echo_cancel (struct echo_device *d, __u64 cookie); +static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset, + struct page **pages, int npages, int async); + +static struct echo_thread_info *echo_env_info(const struct lu_env *env); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_lock_descr eti_descr; + struct lu_fid eti_fid; + struct lu_fid eti_fid2; + struct md_op_spec eti_spec; + struct lov_mds_md_v3 eti_lmm; + struct lov_user_md_v3 eti_lum; + struct md_attr eti_ma; + struct lu_name eti_lname; + /* per-thread values, can be re-used */ + void *eti_big_lmm; + int eti_big_lmmsize; + char eti_name[20]; + struct lu_buf eti_buf; + char eti_xattr_buf[LUSTRE_POSIX_ACL_MAX_SIZE]; +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static struct kmem_cache *echo_lock_kmem; +static struct kmem_cache *echo_object_kmem; +static struct kmem_cache *echo_thread_kmem; +static struct kmem_cache *echo_session_kmem; +//static struct kmem_cache *echo_req_kmem; + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof (struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof (struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof (struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof (struct echo_session_info) + }, +#if 0 + { + .ckd_cache = &echo_req_kmem, + .ckd_name = "echo_req_kmem", + .ckd_size = sizeof (struct echo_req) + }, +#endif + { + .ckd_cache = NULL + } +}; + +/** \defgroup echo_page Page operations + * + * Echo page operations. + * + * @{ + */ +static struct page *echo_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2echo_page(slice)->ep_vmpage; +} + +static int echo_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock) +{ + struct echo_page *ep = cl2echo_page(slice); + + if (!nonblock) + mutex_lock(&ep->ep_lock); + else if (!mutex_trylock(&ep->ep_lock)) + return -EAGAIN; + return 0; +} + +static void echo_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct echo_page *ep = cl2echo_page(slice); + + LASSERT(mutex_is_locked(&ep->ep_lock)); + mutex_unlock(&ep->ep_lock); +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) + return -EBUSY; + return -ENODATA; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + LASSERT(slice->cpl_page->cp_sync_io != NULL); +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct echo_page *ep = cl2echo_page(slice); + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + struct page *vmpage = ep->ep_vmpage; + ENTRY; + + atomic_dec(&eco->eo_npages); + page_cache_release(vmpage); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p %d vm@%p\n", + ep, mutex_is_locked(&ep->ep_lock), ep->ep_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_own = echo_page_own, + .cpo_disown = echo_page_disown, + .cpo_discard = echo_page_discard, + .cpo_vmpage = echo_page_vmpage, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; +/** @} echo_page */ + +/** \defgroup echo_lock Locking + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static void echo_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); +} + +static int echo_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *unused) +{ + return 1; +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, + .clo_delete = echo_lock_delete, + .clo_fits_into = echo_lock_fits_into +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops cl_object operations + * + * operations for cl_object + * + * @{ + */ +static int echo_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct echo_page *ep = cl_object_page_slice(obj, page); + struct echo_object *eco = cl2echo_obj(obj); + ENTRY; + + ep->ep_vmpage = vmpage; + page_cache_get(vmpage); + mutex_init(&ep->ep_lock); + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + RETURN(0); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct echo_lock *el; + ENTRY; + + OBD_SLAB_ALLOC_PTR_GFP(el, echo_lock_kmem, __GFP_IO); + if (el != NULL) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + INIT_LIST_HEAD(&el->el_chain); + atomic_set(&el->el_refcount, 0); + } + RETURN(el == NULL ? -ENOMEM : 0); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops lu_object operations + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + ENTRY; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + if (!ed->ed_next_ismd) { + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + + LASSERT(econf->eoc_md); + eco->eo_lsm = *econf->eoc_md; + /* clear the lsm pointer so that it won't get freed. */ + *econf->eoc_md = NULL; + } else { + eco->eo_lsm = NULL; + } + + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + RETURN(0); +} + +/* taken from osc_unpackmd() */ +static int echo_alloc_memmd(struct echo_device *ed, + struct lov_stripe_md **lsmp) +{ + int lsm_size; + + ENTRY; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_alloc_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp == NULL); + OBD_ALLOC(*lsmp, lsm_size); + if (*lsmp == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if ((*lsmp)->lsm_oinfo[0] == NULL) { + OBD_FREE(*lsmp, lsm_size); + RETURN(-ENOMEM); + } + + loi_init((*lsmp)->lsm_oinfo[0]); + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + ostid_set_seq_echo(&(*lsmp)->lsm_oi); + + RETURN(lsm_size); +} + +static int echo_free_memmd(struct echo_device *ed, struct lov_stripe_md **lsmp) +{ + int lsm_size; + + ENTRY; + + /* If export is lov/osc then use their obd method */ + if (ed->ed_next != NULL) + return obd_free_memmd(ed->ed_ec->ec_exp, lsmp); + /* OFD has no unpackmd method, do everything here */ + lsm_size = lov_stripe_md_size(1); + + LASSERT(*lsmp != NULL); + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + RETURN(0); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + ENTRY; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + if (eco->eo_lsm) + echo_free_memmd(eco->eo_dev, &eco->eo_lsm); + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); + EXIT; +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops lu_device operations + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + ENTRY; + + /* we're the top dev. */ + LASSERT(hdr == NULL); + OBD_SLAB_ALLOC_PTR_GFP(eco, echo_object_kmem, __GFP_IO); + if (eco != NULL) { + struct cl_object_header *hdr = &eco->eo_hdr; + + obj = &echo_obj2cl(eco)->co_lu; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + eco->eo_cl.co_ops = &echo_cl_obj_ops; + obj->lo_ops = &echo_lu_obj_ops; + } + RETURN(obj); +} + +static struct lu_device_operations echo_device_lu_ops = { + .ldo_object_alloc = echo_object_alloc, +}; + +/** @} echo_lu_dev_ops */ + +static struct cl_device_operations echo_device_cl_ops = { +}; + +/** \defgroup echo_init Setup and teardown + * + * Init and fini functions for echo client. + * + * @{ + */ +static int echo_site_init(const struct lu_env *env, struct echo_device *ed) +{ + struct cl_site *site = &ed->ed_site_myself; + int rc; + + /* initialize site */ + rc = cl_site_init(site, &ed->ed_cl); + if (rc) { + CERROR("Cannot initilize site for echo client(%d)\n", rc); + return rc; + } + + rc = lu_site_init_finish(&site->cs_lu); + if (rc) + return rc; + + ed->ed_site = site; + return 0; +} + +static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) +{ + if (ed->ed_site) { + if (!ed->ed_next_ismd) + cl_site_fini(ed->ed_site); + ed->ed_site = NULL; + } +} + +static void *echo_thread_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, echo_thread_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void echo_thread_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, echo_thread_kmem); +} + +static void echo_thread_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_thread_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = echo_thread_key_init, + .lct_fini = echo_thread_key_fini, + .lct_exit = echo_thread_key_exit +}; + +static void *echo_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct echo_session_info *session; + + OBD_SLAB_ALLOC_PTR_GFP(session, echo_session_kmem, __GFP_IO); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void echo_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct echo_session_info *session = data; + OBD_SLAB_FREE_PTR(session, echo_session_kmem); +} + +static void echo_session_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ +} + +static struct lu_context_key echo_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = echo_session_key_init, + .lct_fini = echo_session_key_fini, + .lct_exit = echo_session_key_exit +}; + +LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); + +#define ECHO_SEQ_WIDTH 0xffffffff +static int echo_fid_init(struct echo_device *ed, char *obd_name, + struct seq_server_site *ss) +{ + char *prefix; + int rc; + ENTRY; + + OBD_ALLOC_PTR(ed->ed_cl_seq); + if (ed->ed_cl_seq == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(ed->ed_cl_seq, NULL, + LUSTRE_SEQ_METADATA, + prefix, ss->ss_server_seq); + ed->ed_cl_seq->lcs_width = ECHO_SEQ_WIDTH; + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + GOTO(out_free_seq, rc); + + RETURN(0); + +out_free_seq: + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + RETURN(rc); +} + +static int echo_fid_fini(struct obd_device *obddev) +{ + struct echo_device *ed = obd2echo_dev(obddev); + ENTRY; + + if (ed->ed_cl_seq != NULL) { + seq_client_fini(ed->ed_cl_seq); + OBD_FREE_PTR(ed->ed_cl_seq); + ed->ed_cl_seq = NULL; + } + + RETURN(0); +} + +static struct lu_device *echo_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *next; + struct echo_device *ed; + struct cl_device *cd; + struct obd_device *obd = NULL; /* to keep compiler happy */ + struct obd_device *tgt; + const char *tgt_type_name; + int rc; + int cleanup = 0; + ENTRY; + + OBD_ALLOC_PTR(ed); + if (ed == NULL) + GOTO(out, rc = -ENOMEM); + + cleanup = 1; + cd = &ed->ed_cl; + rc = cl_device_init(cd, t); + if (rc) + GOTO(out, rc); + + cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; + cd->cd_ops = &echo_device_cl_ops; + + cleanup = 2; + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + LASSERT(env != NULL); + + tgt = class_name2obd(lustre_cfg_string(cfg, 1)); + if (tgt == NULL) { + CERROR("Can not find tgt device %s\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -ENODEV); + } + + next = tgt->obd_lu_dev; + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { + ed->ed_next_ismd = 1; + } else { + ed->ed_next_ismd = 0; + rc = echo_site_init(env, ed); + if (rc) + GOTO(out, rc); + } + cleanup = 3; + + rc = echo_client_setup(env, obd, cfg); + if (rc) + GOTO(out, rc); + + ed->ed_ec = &obd->u.echo_client; + cleanup = 4; + + if (ed->ed_next_ismd) { + /* Suppose to connect to some Metadata layer */ + struct lu_site *ls; + struct lu_device *ld; + int found = 0; + + if (next == NULL) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + tgt_type_name = lustre_cfg_string(cfg, 2); + if (!tgt_type_name) { + CERROR("%s no type name for echo %s setup\n", + lustre_cfg_string(cfg, 1), + tgt->obd_type->typ_name); + GOTO(out, rc = -EINVAL); + } + + ls = next->ld_site; + + spin_lock(&ls->ls_ld_lock); + list_for_each_entry(ld, &ls->ls_ld_linkage, ld_linkage) { + if (strcmp(ld->ld_type->ldt_name, tgt_type_name) == 0) { + found = 1; + break; + } + } + spin_unlock(&ls->ls_ld_lock); + + if (found == 0) { + CERROR("%s is not lu device type!\n", + lustre_cfg_string(cfg, 1)); + GOTO(out, rc = -EINVAL); + } + + next = ld; + /* For MD echo client, it will use the site in MDS stack */ + ed->ed_site_myself.cs_lu = *ls; + ed->ed_site = &ed->ed_site_myself; + ed->ed_cl.cd_lu_dev.ld_site = &ed->ed_site_myself.cs_lu; + rc = echo_fid_init(ed, obd->obd_name, lu_site2seq(ls)); + if (rc) { + CERROR("echo fid init error %d\n", rc); + GOTO(out, rc); + } + } else { + /* if echo client is to be stacked upon ost device, the next is + * NULL since ost is not a clio device so far */ + if (next != NULL && !lu_device_is_cl(next)) + next = NULL; + + tgt_type_name = tgt->obd_type->typ_name; + if (next != NULL) { + LASSERT(next != NULL); + if (next->ld_site != NULL) + GOTO(out, rc = -EBUSY); + + next->ld_site = &ed->ed_site->cs_lu; + rc = next->ld_type->ldt_ops->ldto_device_init(env, next, + next->ld_type->ldt_name, + NULL); + if (rc) + GOTO(out, rc); + + /* Tricky case, I have to determine the obd type since + * CLIO uses the different parameters to initialize + * objects for lov & osc. */ + if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0) + ed->ed_next_islov = 1; + else + LASSERT(strcmp(tgt_type_name, + LUSTRE_OSC_NAME) == 0); + } else + LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0); + } + + ed->ed_next = next; + RETURN(&cd->cd_lu_dev); +out: + switch(cleanup) { + case 4: { + int rc2; + rc2 = echo_client_cleanup(obd); + if (rc2) + CERROR("Cleanup obd device %s error(%d)\n", + obd->obd_name, rc2); + } + + case 3: + echo_site_fini(env, ed); + case 2: + cl_device_fini(&ed->ed_cl); + case 1: + OBD_FREE_PTR(ed); + case 0: + default: + break; + } + return(ERR_PTR(rc)); +} + +static int echo_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + LBUG(); + return 0; +} + +static struct lu_device *echo_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct lu_device *next = ed->ed_next; + + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_fini(env, next); + return NULL; +} + +static void echo_lock_release(const struct lu_env *env, + struct echo_lock *ecl, + int still_used) +{ + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_get(clk); + cl_unuse(env, clk); + cl_lock_release(env, clk, "ec enqueue", ecl->el_object); + if (!still_used) { + cl_lock_mutex_get(env, clk); + cl_lock_cancel(env, clk); + cl_lock_delete(env, clk); + cl_lock_mutex_put(env, clk); + } + cl_lock_put(env, clk); +} + +static struct lu_device *echo_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct echo_device *ed = cl2echo_dev(lu2cl_dev(d)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct lu_device *next = ed->ed_next; + + CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n", + ed, next); + + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + /* check if there are objects still alive. + * It shouldn't have any object because lu_site_purge would cleanup + * all of cached objects. Anyway, probably the echo device is being + * parallelly accessed. + */ + spin_lock(&ec->ec_lock); + list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain) + eco->eo_deleted = 1; + spin_unlock(&ec->ec_lock); + + /* purge again */ + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + + CDEBUG(D_INFO, + "Waiting for the reference of echo object to be dropped\n"); + + /* Wait for the last reference to be dropped. */ + spin_lock(&ec->ec_lock); + while (!list_empty(&ec->ec_objects)) { + spin_unlock(&ec->ec_lock); + CERROR("echo_client still has objects at cleanup time, " + "wait for 1 second\n"); + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, + cfs_time_seconds(1)); + lu_site_purge(env, &ed->ed_site->cs_lu, -1); + spin_lock(&ec->ec_lock); + } + spin_unlock(&ec->ec_lock); + + LASSERT(list_empty(&ec->ec_locks)); + + CDEBUG(D_INFO, "No object exists, exiting...\n"); + + echo_client_cleanup(d->ld_obd); + echo_fid_fini(d->ld_obd); + while (next && !ed->ed_next_ismd) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == lu2cl_site(d->ld_site)); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD | LCT_MD_THREAD | LCT_DT_THREAD, +}; +/** @} echo_init */ + +/** \defgroup echo_exports Exported operations + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsmp) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct lov_stripe_md *lsm; + struct echo_object *eco; + struct cl_object *obj; + struct lu_fid *fid; + int refcheck; + int rc; + ENTRY; + + LASSERT(lsmp); + lsm = *lsmp; + LASSERT(lsm); + LASSERT(ostid_id(&lsm->lsm_oi) != 0); + LASSERT(ostid_seq(&lsm->lsm_oi) == FID_SEQ_ECHO); + + /* Never return an object if the obd is to be freed. */ + if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) + RETURN(ERR_PTR(-ENODEV)); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + if (!d->ed_next_islov) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[0]; + LASSERT(oinfo != NULL); + oinfo->loi_oi = lsm->lsm_oi; + conf->eoc_cl.u.coc_oinfo = oinfo; + } else { + struct lustre_md *md; + md = &info->eti_md; + memset(md, 0, sizeof *md); + md->lsm = lsm; + conf->eoc_cl.u.coc_md = md; + } + } + conf->eoc_md = lsmp; + + fid = &info->eti_fid; + rc = ostid_to_fid(fid, &lsm->lsm_oi, 0); + if (rc != 0) + GOTO(out, eco = ERR_PTR(rc)); + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void*)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, + obd_off start, obd_off end, int mode, + __u64 *cookie , __u32 enqflags) +{ + struct cl_io *io; + struct cl_lock *lck; + struct cl_object *obj; + struct cl_lock_descr *descr; + struct echo_thread_info *info; + int rc = -ENOMEM; + ENTRY; + + info = echo_env_info(env); + io = &info->eti_io; + descr = &info->eti_descr; + obj = echo_obj2cl(eco); + + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; + descr->cld_enq_flags = enqflags; + io->ci_obj = obj; + + lck = cl_lock_request(env, io, descr, "ec enqueue", eco); + if (lck) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + rc = cl_wait(env, lck); + if (rc == 0) { + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + if (list_empty(&el->el_chain)) { + list_add(&el->el_chain, &ec->ec_locks); + el->el_cookie = ++ec->ec_unique; + } + atomic_inc(&el->el_refcount); + *cookie = el->el_cookie; + spin_unlock(&ec->ec_lock); + } else { + cl_lock_release(env, lck, "ec enqueue", current); + } + } + RETURN(rc); +} + +static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end, + int mode, __u64 *cookie) +{ + struct echo_thread_info *info; + struct lu_env *env; + struct cl_io *io; + int refcheck; + int result; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, echo_obj2cl(eco)); + if (result < 0) + GOTO(out, result); + LASSERT(result == 0); + + result = cl_echo_enqueue0(env, eco, start, end, mode, cookie, 0); + cl_io_fini(env, io); + + EXIT; +out: + cl_env_put(env, &refcheck); + return result; +} + +static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, + __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; + struct list_head *el; + int found = 0, still_used = 0; + ENTRY; + + LASSERT(ec != NULL); + spin_lock(&ec->ec_lock); + list_for_each (el, &ec->ec_locks) { + ecl = list_entry (el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: "LPX64"\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + if (atomic_dec_and_test(&ecl->el_refcount)) + list_del_init(&ecl->el_chain); + else + still_used = 1; + break; + } + } + spin_unlock(&ec->ec_lock); + + if (!found) + RETURN(-ENOENT); + + echo_lock_release(env, ecl, still_used); + RETURN(0); +} + +static int cl_echo_cancel(struct echo_device *ed, __u64 cookie) +{ + struct lu_env *env; + int refcheck; + int rc; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = cl_echo_cancel0(env, ed, cookie); + + cl_env_put(env, &refcheck); + RETURN(rc); +} + +static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type unused, struct cl_2queue *queue) +{ + struct cl_page *clp; + struct cl_page *temp; + int result = 0; + ENTRY; + + cl_page_list_for_each_safe(clp, temp, &queue->c2_qin) { + int rc; + rc = cl_page_cache_add(env, io, clp, CRT_WRITE); + if (rc == 0) + continue; + result = result ?: rc; + } + RETURN(result); +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset, + struct page **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct lustre_handle lh = { 0 }; + int page_size = cl_page_size(obj); + int refcheck; + int rc; + int i; + ENTRY; + + LASSERT((offset & ~CFS_PAGE_MASK) == 0); + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + queue = &info->eti_queue; + + cl_2queue_init(queue); + + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + + rc = cl_echo_enqueue0(env, eco, offset, + offset + npages * PAGE_CACHE_SIZE - 1, + rw == READ ? LCK_PR : LCK_PW, &lh.cookie, + CEF_NEVER); + if (rc < 0) + GOTO(error_lock, rc); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + cl_page_clip(env, clp, 0, page_size); + + offset += page_size; + } + + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + if (async) + rc = cl_echo_async_brw(env, io, typ, queue); + else + rc = cl_io_submit_sync(env, io, typ, queue, 0); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + } + + cl_echo_cancel0(env, ed, lh.cookie); + EXIT; +error_lock: + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); +out: + cl_env_put(env, &refcheck); + return rc; +} +/** @} echo_exports */ + + +static obd_id last_object_id; + +static int +echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) +{ + struct lov_stripe_md *ulsm = _ulsm; + int nob, i; + + nob = offsetof (struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]); + if (nob > ulsm_nob) + return (-EINVAL); + + if (copy_to_user (ulsm, lsm, sizeof(ulsm))) + return (-EFAULT); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_to_user (ulsm->lsm_oinfo[i], lsm->lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return (-EFAULT); + } + return 0; +} + +static int +echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm, + void *ulsm, int ulsm_nob) +{ + struct echo_client_obd *ec = ed->ed_ec; + int i; + + if (ulsm_nob < sizeof (*lsm)) + return (-EINVAL); + + if (copy_from_user (lsm, ulsm, sizeof (*lsm))) + return (-EFAULT); + + if (lsm->lsm_stripe_count > ec->ec_nstripes || + lsm->lsm_magic != LOV_MAGIC || + (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 || + ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) + return (-EINVAL); + + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_from_user(lsm->lsm_oinfo[i], + ((struct lov_stripe_md *)ulsm)-> \ + lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return (-EFAULT); + } + return (0); +} + +static inline void echo_md_build_name(struct lu_name *lname, char *name, + __u64 id) +{ + sprintf(name, LPU64, id); + lname->ln_name = name; + lname->ln_namelen = strlen(name); +} + +/* similar to mdt_attr_get_complex */ +static int echo_big_lmm_get(const struct lu_env *env, struct md_object *o, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + int rc; + + ENTRY; + + LASSERT(ma->ma_lmm_size > 0); + + rc = mo_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LOV); + if (rc < 0) + RETURN(rc); + + /* big_lmm may need to be grown */ + if (info->eti_big_lmmsize < rc) { + int size = size_roundup_power2(rc); + + if (info->eti_big_lmmsize > 0) { + /* free old buffer */ + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, + info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; + } + + OBD_ALLOC_LARGE(info->eti_big_lmm, size); + if (info->eti_big_lmm == NULL) + RETURN(-ENOMEM); + info->eti_big_lmmsize = size; + } + LASSERT(info->eti_big_lmmsize >= rc); + + info->eti_buf.lb_buf = info->eti_big_lmm; + info->eti_buf.lb_len = info->eti_big_lmmsize; + rc = mo_xattr_get(env, o, &info->eti_buf, XATTR_NAME_LOV); + if (rc < 0) + RETURN(rc); + + ma->ma_valid |= MA_LOV; + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = rc; + + RETURN(0); +} + +int echo_attr_get_complex(const struct lu_env *env, struct md_object *next, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_buf *buf = &info->eti_buf; + umode_t mode = lu_object_attr(&next->mo_lu); + int need = ma->ma_need; + int rc = 0, rc2; + + ENTRY; + + ma->ma_valid = 0; + + if (need & MA_INODE) { + ma->ma_need = MA_INODE; + rc = mo_attr_get(env, next, ma); + if (rc) + GOTO(out, rc); + ma->ma_valid |= MA_INODE; + } + + if (need & MA_LOV) { + if (S_ISREG(mode) || S_ISDIR(mode)) { + LASSERT(ma->ma_lmm_size > 0); + buf->lb_buf = ma->ma_lmm; + buf->lb_len = ma->ma_lmm_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_LOV); + if (rc2 > 0) { + ma->ma_lmm_size = rc2; + ma->ma_valid |= MA_LOV; + } else if (rc2 == -ENODATA) { + /* no LOV EA */ + ma->ma_lmm_size = 0; + } else if (rc2 == -ERANGE) { + rc2 = echo_big_lmm_get(env, next, ma); + if (rc2 < 0) + GOTO(out, rc = rc2); + } else { + GOTO(out, rc = rc2); + } + } + } + +#ifdef CONFIG_FS_POSIX_ACL + if (need & MA_ACL_DEF && S_ISDIR(mode)) { + buf->lb_buf = ma->ma_acl; + buf->lb_len = ma->ma_acl_size; + rc2 = mo_xattr_get(env, next, buf, XATTR_NAME_ACL_DEFAULT); + if (rc2 > 0) { + ma->ma_acl_size = rc2; + ma->ma_valid |= MA_ACL_DEF; + } else if (rc2 == -ENODATA) { + /* no ACLs */ + ma->ma_acl_size = 0; + } else { + GOTO(out, rc = rc2); + } + } +#endif +out: + ma->ma_need = need; + CDEBUG(D_INODE, "after getattr rc = %d, ma_valid = "LPX64" ma_lmm=%p\n", + rc, ma->ma_valid, ma->ma_lmm); + RETURN(rc); +} + +static int +echo_md_create_internal(const struct lu_env *env, struct echo_device *ed, + struct md_object *parent, struct lu_fid *fid, + struct lu_name *lname, struct md_op_spec *spec, + struct md_attr *ma) +{ + struct lu_object *ec_child, *child; + struct lu_device *ld = ed->ed_next; + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid2 = &info->eti_fid2; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + int rc; + + ENTRY; + + rc = mdo_lookup(env, parent, lname, fid2, spec); + if (rc == 0) + return -EEXIST; + else if (rc != -ENOENT) + return rc; + + ec_child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, + fid, &conf); + if (IS_ERR(ec_child)) { + CERROR("Can not find the child "DFID": rc = %ld\n", PFID(fid), + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child "DFID"\n", PFID(fid)); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start creating object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + /* + * Do not perform lookup sanity check. We know that name does not exist. + */ + spec->sp_cr_lookup = 0; + rc = mdo_create(env, parent, lname, lu2md(child), spec, ma); + if (rc) { + CERROR("Can not create child "DFID": rc = %d\n", PFID(fid), rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End creating object "DFID" %s %p rc = %d\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent, rc); + EXIT; +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_set_lmm_size(const struct lu_env *env, struct lu_device *ld, + struct md_attr *ma) +{ + struct echo_thread_info *info = echo_env_info(env); + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + ma->ma_lmm = (void *)&info->eti_lmm; + ma->ma_lmm_size = sizeof(info->eti_lmm); + } else { + LASSERT(info->eti_big_lmmsize); + ma->ma_lmm = info->eti_big_lmm; + ma->ma_lmm_size = info->eti_big_lmmsize; + } + + return 0; +} + +static int echo_create_md_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + struct lu_fid *fid, + char *name, int namelen, + __u64 id, __u32 mode, int count, + int stripe_count, int stripe_offset) +{ + struct lu_object *parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_op_spec *spec = &info->eti_spec; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + memset(ma, 0, sizeof(*ma)); + memset(spec, 0, sizeof(*spec)); + if (stripe_count != 0) { + spec->sp_cr_flags |= FMODE_WRITE; + echo_set_lmm_size(env, ld, ma); + if (stripe_count != -1) { + struct lov_user_md_v3 *lum = &info->eti_lum; + + lum->lmm_magic = LOV_USER_MAGIC_V3; + lum->lmm_stripe_count = stripe_count; + lum->lmm_stripe_offset = stripe_offset; + lum->lmm_pattern = 0; + spec->u.sp_ea.eadata = lum; + spec->u.sp_ea.eadatalen = sizeof(*lum); + spec->sp_cr_flags |= MDS_OPEN_HAS_EA; + } + } + + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME | LA_MODE; + ma->ma_attr.la_ctime = cfs_time_current_64(); + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + /* If name is specified, only create one object by name */ + rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname, + spec, ma); + RETURN(rc); + } + + /* Create multiple object sequenced by id */ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_create_internal(env, ed, lu2md(parent), fid, lname, + spec, ma); + if (rc) { + CERROR("Can not create child %s: rc = %d\n", tmp_name, + rc); + break; + } + id++; + fid->f_oid++; + } + + RETURN(rc); +} + +static struct lu_object *echo_md_lookup(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_object *child; + int rc; + ENTRY; + + CDEBUG(D_INFO, "lookup %s in parent "DFID" %p\n", lname->ln_name, + PFID(fid), parent); + rc = mdo_lookup(env, parent, lname, fid, NULL); + if (rc) { + CERROR("lookup %s: rc = %d\n", lname->ln_name, rc); + RETURN(ERR_PTR(rc)); + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + child = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + + RETURN(child); +} + +static int echo_setattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_device *ld = ed->ed_next; + struct lu_buf *buf = &info->eti_buf; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + echo_md_build_name(lname, name, id); + + ec_child = echo_md_lookup(env, ed, lu2md(parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + rc = -EINVAL; + break; + } + + CDEBUG(D_RPCTRACE, "Start setattr object "DFID"\n", + PFID(lu_object_fid(child))); + + buf->lb_buf = info->eti_xattr_buf; + buf->lb_len = sizeof(info->eti_xattr_buf); + + sprintf(name, "%s.test1", XATTR_USER_PREFIX); + rc = mo_xattr_set(env, lu2md(child), buf, name, + LU_XATTR_CREATE); + if (rc < 0) { + CERROR("Can not setattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End setattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + RETURN(rc); +} + +static int echo_getattr_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + ENTRY; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-ENXIO); + + memset(ma, 0, sizeof(*ma)); + ma->ma_need |= MA_INODE | MA_LOV | MA_PFID | MA_HSM | MA_ACL_DEF; + ma->ma_acl = info->eti_xattr_buf; + ma->ma_acl_size = sizeof(info->eti_xattr_buf); + + for (i = 0; i < count; i++) { + struct lu_object *ec_child, *child; + + ma->ma_valid = 0; + echo_md_build_name(lname, name, id); + echo_set_lmm_size(env, ld, ma); + + ec_child = echo_md_lookup(env, ed, lu2md(parent), lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", + lname->ln_name, PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + lu_object_put(env, ec_child); + RETURN(-EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start getattr object "DFID"\n", + PFID(lu_object_fid(child))); + rc = echo_attr_get_complex(env, lu2md(child), ma); + if (rc) { + CERROR("Can not getattr child "DFID": rc = %d\n", + PFID(lu_object_fid(child)), rc); + lu_object_put(env, ec_child); + break; + } + CDEBUG(D_RPCTRACE, "End getattr object "DFID"\n", + PFID(lu_object_fid(child))); + id++; + lu_object_put(env, ec_child); + } + + RETURN(rc); +} + +static int echo_lookup_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + __u64 id, int count) +{ + struct lu_object *parent; + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + char *name = info->eti_name; + struct lu_fid *fid = &info->eti_fid; + struct lu_device *ld = ed->ed_next; + int rc = 0; + int i; + + if (ec_parent == NULL) + return -1; + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + return -ENXIO; + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + echo_md_build_name(lname, name, id); + + CDEBUG(D_RPCTRACE, "Start lookup object "DFID" %s %p\n", + PFID(lu_object_fid(parent)), lname->ln_name, parent); + + rc = mdo_lookup(env, lu2md(parent), lname, fid, NULL); + if (rc) { + CERROR("Can not lookup child %s: rc = %d\n", name, rc); + break; + } + CDEBUG(D_RPCTRACE, "End lookup object "DFID" %s %p\n", + PFID(lu_object_fid(parent)), lname->ln_name, parent); + + id++; + } + return rc; +} + +static int echo_md_destroy_internal(const struct lu_env *env, + struct echo_device *ed, + struct md_object *parent, + struct lu_name *lname, + struct md_attr *ma) +{ + struct lu_device *ld = ed->ed_next; + struct lu_object *ec_child; + struct lu_object *child; + int rc; + + ENTRY; + + ec_child = echo_md_lookup(env, ed, parent, lname); + if (IS_ERR(ec_child)) { + CERROR("Can't find child %s: rc = %ld\n", lname->ln_name, + PTR_ERR(ec_child)); + RETURN(PTR_ERR(ec_child)); + } + + child = lu_object_locate(ec_child->lo_header, ld->ld_type); + if (child == NULL) { + CERROR("Can not locate the child %s\n", lname->ln_name); + GOTO(out_put, rc = -EINVAL); + } + + CDEBUG(D_RPCTRACE, "Start destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); + + rc = mdo_unlink(env, parent, lu2md(child), lname, ma, 0); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", + lname->ln_name, rc); + GOTO(out_put, rc); + } + CDEBUG(D_RPCTRACE, "End destroy object "DFID" %s %p\n", + PFID(lu_object_fid(&parent->mo_lu)), lname->ln_name, parent); +out_put: + lu_object_put(env, ec_child); + return rc; +} + +static int echo_destroy_object(const struct lu_env *env, + struct echo_device *ed, + struct lu_object *ec_parent, + char *name, int namelen, + __u64 id, __u32 mode, + int count) +{ + struct echo_thread_info *info = echo_env_info(env); + struct lu_name *lname = &info->eti_lname; + struct md_attr *ma = &info->eti_ma; + struct lu_device *ld = ed->ed_next; + struct lu_object *parent; + int rc = 0; + int i; + ENTRY; + + parent = lu_object_locate(ec_parent->lo_header, ld->ld_type); + if (parent == NULL) + RETURN(-EINVAL); + + memset(ma, 0, sizeof(*ma)); + ma->ma_attr.la_mode = mode; + ma->ma_attr.la_valid = LA_CTIME; + ma->ma_attr.la_ctime = cfs_time_current_64(); + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + + if (name != NULL) { + lname->ln_name = name; + lname->ln_namelen = namelen; + rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname, + ma); + RETURN(rc); + } + + /*prepare the requests*/ + for (i = 0; i < count; i++) { + char *tmp_name = info->eti_name; + + ma->ma_valid = 0; + echo_md_build_name(lname, tmp_name, id); + + rc = echo_md_destroy_internal(env, ed, lu2md(parent), lname, + ma); + if (rc) { + CERROR("Can not unlink child %s: rc = %d\n", name, rc); + break; + } + id++; + } + + RETURN(rc); +} + +static struct lu_object *echo_resolve_path(const struct lu_env *env, + struct echo_device *ed, char *path, + int path_len) +{ + struct lu_device *ld = ed->ed_next; + struct md_device *md = lu2md_dev(ld); + struct echo_thread_info *info = echo_env_info(env); + struct lu_fid *fid = &info->eti_fid; + struct lu_name *lname = &info->eti_lname; + struct lu_object *parent = NULL; + struct lu_object *child = NULL; + int rc = 0; + ENTRY; + + /*Only support MDD layer right now*/ + rc = md->md_ops->mdo_root_get(env, md, fid); + if (rc) { + CERROR("get root error: rc = %d\n", rc); + RETURN(ERR_PTR(rc)); + } + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + parent = lu_object_find_at(env, &ed->ed_cl.cd_lu_dev, fid, NULL); + if (IS_ERR(parent)) { + CERROR("Can not find the parent "DFID": rc = %ld\n", + PFID(fid), PTR_ERR(parent)); + RETURN(parent); + } + + while (1) { + struct lu_object *ld_parent; + char *e; + + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + + lname->ln_name = e; + lname->ln_namelen = strlen(e); + + ld_parent = lu_object_locate(parent->lo_header, ld->ld_type); + if (ld_parent == NULL) { + lu_object_put(env, parent); + rc = -EINVAL; + break; + } + + child = echo_md_lookup(env, ed, lu2md(ld_parent), lname); + lu_object_put(env, parent); + if (IS_ERR(child)) { + rc = (int)PTR_ERR(child); + CERROR("lookup %s under parent "DFID": rc = %d\n", + lname->ln_name, PFID(lu_object_fid(ld_parent)), + rc); + break; + } + parent = child; + } + if (rc) + RETURN(ERR_PTR(rc)); + + RETURN(parent); +} + +static void echo_ucred_init(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + + ucred->uc_valid = UCRED_INVALID; + + ucred->uc_suppgids[0] = -1; + ucred->uc_suppgids[1] = -1; + + ucred->uc_uid = ucred->uc_o_uid = current_uid(); + ucred->uc_gid = ucred->uc_o_gid = current_gid(); + ucred->uc_fsuid = ucred->uc_o_fsuid = current_fsuid(); + ucred->uc_fsgid = ucred->uc_o_fsgid = current_fsgid(); + ucred->uc_cap = cfs_curproc_cap_pack(); + + /* remove fs privilege for non-root user. */ + if (ucred->uc_fsuid) + ucred->uc_cap &= ~CFS_CAP_FS_MASK; + ucred->uc_valid = UCRED_NEW; +} + +static void echo_ucred_fini(struct lu_env *env) +{ + struct lu_ucred *ucred = lu_ucred(env); + ucred->uc_valid = UCRED_INIT; +} + +#define ECHO_MD_CTX_TAG (LCT_REMEMBER | LCT_MD_THREAD) +#define ECHO_MD_SES_TAG (LCT_REMEMBER | LCT_SESSION) +static int echo_md_handler(struct echo_device *ed, int command, + char *path, int path_len, __u64 id, int count, + struct obd_ioctl_data *data) +{ + struct echo_thread_info *info; + struct lu_device *ld = ed->ed_next; + struct lu_env *env; + int refcheck; + struct lu_object *parent; + char *name = NULL; + int namelen = data->ioc_plen2; + int rc = 0; + ENTRY; + + if (ld == NULL) { + CERROR("MD echo client is not being initialized properly\n"); + RETURN(-EINVAL); + } + + if (strcmp(ld->ld_type->ldt_name, LUSTRE_MDD_NAME)) { + CERROR("Only support MDD layer right now!\n"); + RETURN(-EINVAL); + } + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + rc = lu_env_refill_by_tags(env, ECHO_MD_CTX_TAG, ECHO_MD_SES_TAG); + if (rc != 0) + GOTO(out_env, rc); + + /* init big_lmm buffer */ + info = echo_env_info(env); + LASSERT(info->eti_big_lmm == NULL); + OBD_ALLOC_LARGE(info->eti_big_lmm, MIN_MD_SIZE); + if (info->eti_big_lmm == NULL) + GOTO(out_env, rc = -ENOMEM); + info->eti_big_lmmsize = MIN_MD_SIZE; + + parent = echo_resolve_path(env, ed, path, path_len); + if (IS_ERR(parent)) { + CERROR("Can not resolve the path %s: rc = %ld\n", path, + PTR_ERR(parent)); + GOTO(out_free, rc = PTR_ERR(parent)); + } + + if (namelen > 0) { + OBD_ALLOC(name, namelen + 1); + if (name == NULL) + GOTO(out_put, rc = -ENOMEM); + if (copy_from_user(name, data->ioc_pbuf2, namelen)) + GOTO(out_name, rc = -EFAULT); + } + + echo_ucred_init(env); + + switch (command) { + case ECHO_MD_CREATE: + case ECHO_MD_MKDIR: { + struct echo_thread_info *info = echo_env_info(env); + __u32 mode = data->ioc_obdo2.o_mode; + struct lu_fid *fid = &info->eti_fid; + int stripe_count = (int)data->ioc_obdo2.o_misc; + int stripe_index = (int)data->ioc_obdo2.o_stripe_idx; + + rc = ostid_to_fid(fid, &data->ioc_obdo1.o_oi, 0); + if (rc != 0) + break; + + /* In the function below, .hs_keycmp resolves to + * lu_obj_hop_keycmp() */ + /* coverity[overrun-buffer-val] */ + rc = echo_create_md_object(env, ed, parent, fid, name, namelen, + id, mode, count, stripe_count, + stripe_index); + break; + } + case ECHO_MD_DESTROY: + case ECHO_MD_RMDIR: { + __u32 mode = data->ioc_obdo2.o_mode; + + rc = echo_destroy_object(env, ed, parent, name, namelen, + id, mode, count); + break; + } + case ECHO_MD_LOOKUP: + rc = echo_lookup_object(env, ed, parent, id, count); + break; + case ECHO_MD_GETATTR: + rc = echo_getattr_object(env, ed, parent, id, count); + break; + case ECHO_MD_SETATTR: + rc = echo_setattr_object(env, ed, parent, id, count); + break; + default: + CERROR("unknown command %d\n", command); + rc = -EINVAL; + break; + } + echo_ucred_fini(env); + +out_name: + if (name != NULL) + OBD_FREE(name, namelen + 1); +out_put: + lu_object_put(env, parent); +out_free: + LASSERT(info->eti_big_lmm); + OBD_FREE_LARGE(info->eti_big_lmm, info->eti_big_lmmsize); + info->eti_big_lmm = NULL; + info->eti_big_lmmsize = 0; +out_env: + cl_env_put(env, &refcheck); + return rc; +} + +static int echo_create_object(const struct lu_env *env, struct echo_device *ed, + int on_target, struct obdo *oa, void *ulsm, + int ulsm_nob, struct obd_trans_info *oti) +{ + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = NULL; + int rc; + int created = 0; + ENTRY; + + if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */ + (on_target || /* set_stripe */ + ec->ec_nstripes != 0)) { /* LOV */ + CERROR ("No valid oid\n"); + RETURN(-EINVAL); + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) { + CERROR("Cannot allocate md: rc = %d\n", rc); + GOTO(failed, rc); + } + + if (ulsm != NULL) { + int i, idx; + + rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob); + if (rc != 0) + GOTO(failed, rc); + + if (lsm->lsm_stripe_count == 0) + lsm->lsm_stripe_count = ec->ec_nstripes; + + if (lsm->lsm_stripe_size == 0) + lsm->lsm_stripe_size = PAGE_CACHE_SIZE; + + idx = cfs_rand(); + + /* setup stripes: indices + default ids if required */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (ostid_id(&lsm->lsm_oinfo[i]->loi_oi) == 0) + lsm->lsm_oinfo[i]->loi_oi = lsm->lsm_oi; + + lsm->lsm_oinfo[i]->loi_ost_idx = + (idx + i) % ec->ec_nstripes; + } + } + + /* setup object ID here for !on_target and LOV hint */ + if (oa->o_valid & OBD_MD_FLID) { + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + lsm->lsm_oi = oa->o_oi; + } + + if (ostid_id(&lsm->lsm_oi) == 0) + ostid_set_id(&lsm->lsm_oi, ++last_object_id); + + rc = 0; + if (on_target) { + /* Only echo objects are allowed to be created */ + LASSERT((oa->o_valid & OBD_MD_FLGROUP) && + (ostid_seq(&oa->o_oi) == FID_SEQ_ECHO)); + rc = obd_create(env, ec->ec_exp, oa, &lsm, oti); + if (rc != 0) { + CERROR("Cannot create objects: rc = %d\n", rc); + GOTO(failed, rc); + } + created = 1; + } + + /* See what object ID we were given */ + oa->o_oi = lsm->lsm_oi; + oa->o_valid |= OBD_MD_FLID; + + eco = cl_echo_object_find(ed, &lsm); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); + + CDEBUG(D_INFO, "oa oid "DOSTID"\n", POSTID(&oa->o_oi)); + EXIT; + + failed: + if (created && rc) + obd_destroy(env, ec->ec_exp, oa, lsm, oti, NULL, NULL); + if (lsm) + echo_free_memmd(ed, &lsm); + if (rc) + CERROR("create object failed with: rc = %d\n", rc); + return (rc); +} + +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) +{ + struct lov_stripe_md *lsm = NULL; + struct echo_object *eco; + int rc; + ENTRY; + + if ((oa->o_valid & OBD_MD_FLID) == 0 || ostid_id(&oa->o_oi) == 0) { + /* disallow use of object id 0 */ + CERROR ("No valid oid\n"); + RETURN(-EINVAL); + } + + rc = echo_alloc_memmd(ed, &lsm); + if (rc < 0) + RETURN(rc); + + lsm->lsm_oi = oa->o_oi; + if (!(oa->o_valid & OBD_MD_FLGROUP)) + ostid_set_seq_echo(&lsm->lsm_oi); + + rc = 0; + eco = cl_echo_object_find(ed, &lsm); + if (!IS_ERR(eco)) + *ecop = eco; + else + rc = PTR_ERR(eco); + if (lsm) + echo_free_memmd(ed, &lsm); + RETURN(rc); +} + +static void echo_put_object(struct echo_object *eco) +{ + if (cl_echo_object_put(eco)) + CERROR("echo client: drop an object failed"); +} + +static void +echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp) +{ + unsigned long stripe_count; + unsigned long stripe_size; + unsigned long width; + unsigned long woffset; + int stripe_index; + obd_off offset; + + if (lsm->lsm_stripe_count <= 1) + return; + + offset = *offp; + stripe_size = lsm->lsm_stripe_size; + stripe_count = lsm->lsm_stripe_count; + + /* width = # bytes in all stripes */ + width = stripe_size * stripe_count; + + /* woffset = offset within a width; offset = whole number of widths */ + woffset = do_div (offset, width); + + stripe_index = woffset / stripe_size; + + *idp = ostid_id(&lsm->lsm_oinfo[stripe_index]->loi_oi); + *offp = offset * stripe_size + woffset % stripe_size; +} + +static void +echo_client_page_debug_setup(struct lov_stripe_md *lsm, + struct page *page, int rw, obd_id id, + obd_off offset, obd_off count) +{ + char *addr; + obd_off stripe_off; + obd_id stripe_id; + int delta; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + if (rw == OBD_BRW_WRITE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id(lsm, &stripe_off, &stripe_id); + } else { + stripe_off = 0xdeadbeef00c0ffeeULL; + stripe_id = 0xdeadbeef00c0ffeeULL; + } + block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + } + + kunmap(page); +} + +static int echo_client_page_debug_check(struct lov_stripe_md *lsm, + struct page *page, obd_id id, + obd_off offset, obd_off count) +{ + obd_off stripe_off; + obd_id stripe_id; + char *addr; + int delta; + int rc; + int rc2; + + /* no partial pages on the client */ + LASSERT(count == PAGE_CACHE_SIZE); + + addr = kmap(page); + + for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { + stripe_off = offset + delta; + stripe_id = id; + echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id); + + rc2 = block_debug_check("test_brw", + addr + delta, OBD_ECHO_BLOCK_SIZE, + stripe_off, stripe_id); + if (rc2 != 0) { + CERROR ("Error in echo object "LPX64"\n", id); + rc = rc2; + } + } + + kunmap(page); + return rc; +} + +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, obd_off offset, + obd_size count, int async, + struct obd_trans_info *oti) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + obd_count npages; + struct brw_page *pga; + struct brw_page *pgp; + struct page **pages; + obd_off off; + int i; + int rc; + int verify; + int gfp_mask; + int brw_flags = 0; + ENTRY; + + verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && + (oa->o_valid & OBD_MD_FLFLAGS) != 0 && + (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); + + gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_IOFS : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + LASSERT(lsm != NULL); + LASSERT(ostid_id(&lsm->lsm_oi) == ostid_id(&oa->o_oi)); + + if (count <= 0 || + (count & (~CFS_PAGE_MASK)) != 0) + RETURN(-EINVAL); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_CACHE_SHIFT; + + if (rw == OBD_BRW_WRITE) + brw_flags = OBD_BRW_ASYNC; + + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + RETURN(-ENOMEM); + } + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_CACHE_SIZE) { + + LASSERT (pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + OBD_PAGE_ALLOC(pgp->pg, gfp_mask); + if (pgp->pg == NULL) + goto out; + + pages[i] = pgp->pg; + pgp->count = PAGE_CACHE_SIZE; + pgp->off = off; + pgp->flag = brw_flags; + + if (verify) + echo_client_page_debug_setup(lsm, pgp->pg, rw, + ostid_id(&oa->o_oi), off, + pgp->count); + } + + /* brw mode can only be used at client */ + LASSERT(ed->ed_next != NULL); + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); + + out: + if (rc != 0 || rw != OBD_BRW_READ) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + int vrc; + vrc = echo_client_page_debug_check(lsm, pgp->pg, + ostid_id(&oa->o_oi), + pgp->off, pgp->count); + if (vrc != 0 && rc == 0) + rc = vrc; + } + OBD_PAGE_FREE(pgp->pg); + } + OBD_FREE(pga, npages * sizeof(*pga)); + OBD_FREE(pages, npages * sizeof(*pages)); + RETURN(rc); +} + +static int echo_client_prep_commit(const struct lu_env *env, + struct obd_export *exp, int rw, + struct obdo *oa, struct echo_object *eco, + obd_off offset, obd_size count, + obd_size batch, struct obd_trans_info *oti, + int async) +{ + struct lov_stripe_md *lsm = eco->eo_lsm; + struct obd_ioobj ioo; + struct niobuf_local *lnb; + struct niobuf_remote *rnb; + obd_off off; + obd_size npages, tot_pages; + int i, ret = 0, brw_flags = 0; + + ENTRY; + + if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0 || + (lsm != NULL && ostid_id(&lsm->lsm_oi) != ostid_id(&oa->o_oi))) + RETURN(-EINVAL); + + npages = batch >> PAGE_CACHE_SHIFT; + tot_pages = count >> PAGE_CACHE_SHIFT; + + OBD_ALLOC(lnb, npages * sizeof(struct niobuf_local)); + OBD_ALLOC(rnb, npages * sizeof(struct niobuf_remote)); + + if (lnb == NULL || rnb == NULL) + GOTO(out, ret = -ENOMEM); + + if (rw == OBD_BRW_WRITE && async) + brw_flags |= OBD_BRW_ASYNC; + + obdo_to_ioobj(oa, &ioo); + + off = offset; + + for(; tot_pages; tot_pages -= npages) { + int lpages; + + if (tot_pages < npages) + npages = tot_pages; + + for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) { + rnb[i].offset = off; + rnb[i].len = PAGE_CACHE_SIZE; + rnb[i].flags = brw_flags; + } + + ioo.ioo_bufcnt = npages; + oti->oti_transno = 0; + + lpages = npages; + ret = obd_preprw(env, rw, exp, oa, 1, &ioo, rnb, &lpages, + lnb, oti, NULL); + if (ret != 0) + GOTO(out, ret); + LASSERT(lpages == npages); + + for (i = 0; i < lpages; i++) { + struct page *page = lnb[i].page; + + /* read past eof? */ + if (page == NULL && lnb[i].rc == 0) + continue; + + if (async) + lnb[i].flags |= OBD_BRW_ASYNC; + + if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || + (oa->o_valid & OBD_MD_FLFLAGS) == 0 || + (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) + continue; + + if (rw == OBD_BRW_WRITE) + echo_client_page_debug_setup(lsm, page, rw, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + else + echo_client_page_debug_check(lsm, page, + ostid_id(&oa->o_oi), + rnb[i].offset, + rnb[i].len); + } + + ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, + rnb, npages, lnb, oti, ret); + if (ret != 0) + GOTO(out, ret); + + /* Reset oti otherwise it would confuse ldiskfs. */ + memset(oti, 0, sizeof(*oti)); + } + +out: + if (lnb) + OBD_FREE(lnb, npages * sizeof(struct niobuf_local)); + if (rnb) + OBD_FREE(rnb, npages * sizeof(struct niobuf_remote)); + RETURN(ret); +} + +static int echo_client_brw_ioctl(const struct lu_env *env, int rw, + struct obd_export *exp, + struct obd_ioctl_data *data, + struct obd_trans_info *dummy_oti) +{ + struct obd_device *obd = class_exp2obd(exp); + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obdo *oa = &data->ioc_obdo1; + struct echo_object *eco; + int rc; + int async = 1; + long test_mode; + ENTRY; + + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + rc = echo_get_object(&eco, ed, oa); + if (rc) + RETURN(rc); + + oa->o_valid &= ~OBD_MD_FLHANDLE; + + /* OFD/obdfilter works only via prep/commit */ + test_mode = (long)data->ioc_pbuf1; + if (test_mode == 1) + async = 0; + + if (ed->ed_next == NULL && test_mode != 3) { + test_mode = 3; + data->ioc_plen1 = data->ioc_count; + } + + /* Truncate batch size to maximum */ + if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) + data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; + + switch (test_mode) { + case 1: + /* fall through */ + case 2: + rc = echo_client_kbrw(ed, rw, oa, + eco, data->ioc_offset, + data->ioc_count, async, dummy_oti); + break; + case 3: + rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, + eco, data->ioc_offset, + data->ioc_count, data->ioc_plen1, + dummy_oti, async); + break; + default: + rc = -EINVAL; + } + echo_put_object(eco); + RETURN(rc); +} + +static int +echo_client_enqueue(struct obd_export *exp, struct obdo *oa, + int mode, obd_off offset, obd_size nob) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + struct lustre_handle *ulh = &oa->o_handle; + struct echo_object *eco; + obd_off end; + int rc; + ENTRY; + + if (ed->ed_next == NULL) + RETURN(-EOPNOTSUPP); + + if (!(mode == LCK_PR || mode == LCK_PW)) + RETURN(-EINVAL); + + if ((offset & (~CFS_PAGE_MASK)) != 0 || + (nob & (~CFS_PAGE_MASK)) != 0) + RETURN(-EINVAL); + + rc = echo_get_object (&eco, ed, oa); + if (rc != 0) + RETURN(rc); + + end = (nob == 0) ? ((obd_off) -1) : (offset + nob - 1); + rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie); + if (rc == 0) { + oa->o_valid |= OBD_MD_FLHANDLE; + CDEBUG(D_INFO, "Cookie is "LPX64"\n", ulh->cookie); + } + echo_put_object(eco); + RETURN(rc); +} + +static int +echo_client_cancel(struct obd_export *exp, struct obdo *oa) +{ + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + __u64 cookie = oa->o_handle.cookie; + + if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) + return -EINVAL; + + CDEBUG(D_INFO, "Cookie is "LPX64"\n", cookie); + return cl_echo_cancel(ed, cookie); +} + +static int +echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; + struct obd_ioctl_data *data = karg; + struct obd_trans_info dummy_oti; + struct lu_env *env; + struct oti_req_ack_lock *ack_lock; + struct obdo *oa; + struct lu_fid fid; + int rw = OBD_BRW_READ; + int rc = 0; + int i; + ENTRY; + + memset(&dummy_oti, 0, sizeof(dummy_oti)); + + oa = &data->ioc_obdo1; + if (!(oa->o_valid & OBD_MD_FLGROUP)) { + oa->o_valid |= OBD_MD_FLGROUP; + ostid_set_seq_echo(&oa->o_oi); + } + + /* This FID is unpacked just for validation at this point */ + rc = ostid_to_fid(&fid, &oa->o_oi, 0); + if (rc < 0) + RETURN(rc); + + OBD_ALLOC_PTR(env); + if (env == NULL) + RETURN(-ENOMEM); + + rc = lu_env_init(env, LCT_DT_THREAD); + if (rc) + GOTO(out, rc = -ENOMEM); + + switch (cmd) { + case OBD_IOC_CREATE: /* may create echo object */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_create_object(env, ed, 1, oa, data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + GOTO(out, rc); + + case OBD_IOC_ECHO_MD: { + int count; + int cmd; + char *dir = NULL; + int dirlen; + __u64 id; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + count = data->ioc_count; + cmd = data->ioc_command; + + id = ostid_id(&data->ioc_obdo2.o_oi); + + dirlen = data->ioc_plen1; + OBD_ALLOC(dir, dirlen + 1); + if (dir == NULL) + GOTO(out, rc = -ENOMEM); + + if (copy_from_user(dir, data->ioc_pbuf1, dirlen)) { + OBD_FREE(dir, data->ioc_plen1 + 1); + GOTO(out, rc = -EFAULT); + } + + rc = echo_md_handler(ed, cmd, dir, dirlen, id, count, data); + OBD_FREE(dir, dirlen + 1); + GOTO(out, rc); + } + case OBD_IOC_ECHO_ALLOC_SEQ: { + struct lu_env *cl_env; + int refcheck; + __u64 seq; + int max_count; + + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO(out, rc = -EPERM); + + cl_env = cl_env_get(&refcheck); + if (IS_ERR(cl_env)) + GOTO(out, rc = PTR_ERR(cl_env)); + + rc = lu_env_refill_by_tags(cl_env, ECHO_MD_CTX_TAG, + ECHO_MD_SES_TAG); + if (rc != 0) { + cl_env_put(cl_env, &refcheck); + GOTO(out, rc); + } + + rc = seq_client_get_seq(cl_env, ed->ed_cl_seq, &seq); + cl_env_put(cl_env, &refcheck); + if (rc < 0) { + CERROR("%s: Can not alloc seq: rc = %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + + if (copy_to_user(data->ioc_pbuf1, &seq, data->ioc_plen1)) + return -EFAULT; + + max_count = LUSTRE_METADATA_SEQ_MAX_WIDTH; + if (copy_to_user(data->ioc_pbuf2, &max_count, + data->ioc_plen2)) + return -EFAULT; + GOTO(out, rc); + } + case OBD_IOC_DESTROY: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = obd_destroy(env, ec->ec_exp, oa, eco->eo_lsm, + &dummy_oti, NULL, NULL); + if (rc == 0) + eco->eo_deleted = 1; + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_GETATTR: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + oinfo.oi_md = eco->eo_lsm; + oinfo.oi_oa = oa; + rc = obd_getattr(env, ec->ec_exp, &oinfo); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_SETATTR: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + struct obd_info oinfo = { { { 0 } } }; + oinfo.oi_oa = oa; + oinfo.oi_md = eco->eo_lsm; + + rc = obd_setattr(env, ec->ec_exp, &oinfo, NULL); + echo_put_object(eco); + } + GOTO(out, rc); + + case OBD_IOC_BRW_WRITE: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rw = OBD_BRW_WRITE; + /* fall through */ + case OBD_IOC_BRW_READ: + rc = echo_client_brw_ioctl(env, rw, exp, data, &dummy_oti); + GOTO(out, rc); + + case ECHO_IOC_GET_STRIPE: + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1, + data->ioc_plen1); + echo_put_object(eco); + } + GOTO(out, rc); + + case ECHO_IOC_SET_STRIPE: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + if (data->ioc_pbuf1 == NULL) { /* unset */ + rc = echo_get_object(&eco, ed, oa); + if (rc == 0) { + eco->eo_deleted = 1; + echo_put_object(eco); + } + } else { + rc = echo_create_object(env, ed, 0, oa, + data->ioc_pbuf1, + data->ioc_plen1, &dummy_oti); + } + GOTO (out, rc); + + case ECHO_IOC_ENQUEUE: + if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_client_enqueue(exp, oa, + data->ioc_conn1, /* lock mode */ + data->ioc_offset, + data->ioc_count);/*extent*/ + GOTO (out, rc); + + case ECHO_IOC_CANCEL: + rc = echo_client_cancel(exp, oa); + GOTO (out, rc); + + default: + CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd); + GOTO (out, rc = -ENOTTY); + } + + EXIT; +out: + lu_env_fini(env); + OBD_FREE_PTR(env); + + /* XXX this should be in a helper also called by target_send_reply */ + for (ack_lock = dummy_oti.oti_ack_locks, i = 0; i < 4; + i++, ack_lock++) { + if (!ack_lock->mode) + break; + ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); + } + + return rc; +} + +static int echo_client_setup(const struct lu_env *env, + struct obd_device *obddev, struct lustre_cfg *lcfg) +{ + struct echo_client_obd *ec = &obddev->u.echo_client; + struct obd_device *tgt; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; + struct obd_connect_data *ocd = NULL; + int rc; + ENTRY; + + if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + spin_lock_init(&ec->ec_lock); + INIT_LIST_HEAD (&ec->ec_objects); + INIT_LIST_HEAD (&ec->ec_locks); + ec->ec_unique = 0; + ec->ec_nstripes = 0; + + if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { + lu_context_tags_update(ECHO_MD_CTX_TAG); + lu_session_tags_update(ECHO_MD_SES_TAG); + RETURN(0); + } + + OBD_ALLOC(ocd, sizeof(*ocd)); + if (ocd == NULL) { + CERROR("Can't alloc ocd connecting to %s\n", + lustre_cfg_string(lcfg, 1)); + return -ENOMEM; + } + + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | + OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE; + ocd->ocd_brw_size = DT_MAX_BRW_SIZE; + ocd->ocd_version = LUSTRE_VERSION_CODE; + ocd->ocd_group = FID_SEQ_ECHO; + + rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); + if (rc == 0) { + /* Turn off pinger because it connects to tgt obd directly. */ + spin_lock(&tgt->obd_dev_lock); + list_del_init(&ec->ec_exp->exp_obd_chain_timed); + spin_unlock(&tgt->obd_dev_lock); + } + + OBD_FREE(ocd, sizeof(*ocd)); + + if (rc != 0) { + CERROR("fail to connect to device %s\n", + lustre_cfg_string(lcfg, 1)); + return (rc); + } + + RETURN(rc); +} + +static int echo_client_cleanup(struct obd_device *obddev) +{ + struct echo_device *ed = obd2echo_dev(obddev); + struct echo_client_obd *ec = &obddev->u.echo_client; + int rc; + ENTRY; + + /*Do nothing for Metadata echo client*/ + if (ed == NULL ) + RETURN(0); + + if (ed->ed_next_ismd) { + lu_context_tags_clear(ECHO_MD_CTX_TAG); + lu_session_tags_clear(ECHO_MD_SES_TAG); + RETURN(0); + } + + if (!list_empty(&obddev->obd_exports)) { + CERROR("still has clients!\n"); + RETURN(-EBUSY); + } + + LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); + rc = obd_disconnect(ec->ec_exp); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + RETURN(rc); +} + +static int echo_client_connect(const struct lu_env *env, + struct obd_export **exp, + struct obd_device *src, struct obd_uuid *cluuid, + struct obd_connect_data *data, void *localdata) +{ + int rc; + struct lustre_handle conn = { 0 }; + + ENTRY; + rc = class_connect(&conn, src, cluuid); + if (rc == 0) { + *exp = class_conn2export(&conn); + } + + RETURN (rc); +} + +static int echo_client_disconnect(struct obd_export *exp) +{ +#if 0 + struct obd_device *obd; + struct echo_client_obd *ec; + struct ec_lock *ecl; +#endif + int rc; + ENTRY; + + if (exp == NULL) + GOTO(out, rc = -EINVAL); + +#if 0 + obd = exp->exp_obd; + ec = &obd->u.echo_client; + + /* no more contention on export's lock list */ + while (!list_empty (&exp->exp_ec_data.eced_locks)) { + ecl = list_entry (exp->exp_ec_data.eced_locks.next, + struct ec_lock, ecl_exp_chain); + list_del (&ecl->ecl_exp_chain); + + rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm, + ecl->ecl_mode, &ecl->ecl_lock_handle); + + CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect " + "(%d)\n", ecl->ecl_object->eco_id, rc); + + echo_put_object (ecl->ecl_object); + OBD_FREE (ecl, sizeof (*ecl)); + } +#endif + + rc = class_disconnect(exp); + GOTO(out, rc); + out: + return rc; +} + +static struct obd_ops echo_client_obd_ops = { + .o_owner = THIS_MODULE, + +#if 0 + .o_setup = echo_client_setup, + .o_cleanup = echo_client_cleanup, +#endif + + .o_iocontrol = echo_client_iocontrol, + .o_connect = echo_client_connect, + .o_disconnect = echo_client_disconnect +}; + +int echo_client_init(void) +{ + struct lprocfs_static_vars lvars = { 0 }; + int rc; + + lprocfs_echo_init_vars(&lvars); + + rc = lu_kmem_init(echo_caches); + if (rc == 0) { + rc = class_register_type(&echo_client_obd_ops, NULL, + lvars.module_vars, + LUSTRE_ECHO_CLIENT_NAME, + &echo_device_type); + if (rc) + lu_kmem_fini(echo_caches); + } + return rc; +} + +void echo_client_exit(void) +{ + class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); +} + +static int __init obdecho_init(void) +{ + struct lprocfs_static_vars lvars; + int rc; + + ENTRY; + LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); + + LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); + + lprocfs_echo_init_vars(&lvars); + + + rc = echo_client_init(); + + RETURN(rc); +} + +static void /*__exit*/ obdecho_exit(void) +{ + echo_client_exit(); + +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Testing Echo OBD driver"); +MODULE_LICENSE("GPL"); + +cfs_module(obdecho, LUSTRE_VERSION_STRING, obdecho_init, obdecho_exit); + +/** @} echo_client */ diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h new file mode 100644 index 000000000000..8e9dbc2351e7 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdecho/echo_internal.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdecho/echo_internal.h + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + + +#endif diff --git a/drivers/staging/lustre/lustre/obdecho/lproc_echo.c b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c new file mode 100644 index 000000000000..e23ed32a4855 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdecho/lproc_echo.c @@ -0,0 +1,55 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_ECHO + +#include +#include + +#ifdef LPROCFS +static struct lprocfs_vars lprocfs_echo_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_echo_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_echo_module_vars; + lvars->obd_vars = lprocfs_echo_obd_vars; +} +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile new file mode 100644 index 000000000000..bbd2f7707e9f --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_LUSTRE_FS) += osc.o +osc-y := osc_request.o lproc_osc.o osc_dev.o osc_object.o \ + osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o + + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c new file mode 100644 index 000000000000..016ad02da297 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c @@ -0,0 +1,715 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include +#include +#include +#include +#include +#include "osc_internal.h" + +#ifdef LPROCFS +static int osc_rd_active(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + int rc; + + LPROCFS_CLIMP_CHECK(dev); + rc = snprintf(page, count, "%d\n", !dev->u.cli.cl_import->imp_deactive); + LPROCFS_CLIMP_EXIT(dev); + return rc; +} + +static int osc_wr_active(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + if (val < 0 || val > 1) + return -ERANGE; + + /* opposite senses */ + if (dev->u.cli.cl_import->imp_deactive == val) + rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); + else + CDEBUG(D_CONFIG, "activate %d: ignoring repeat request\n", val); + + return count; +} + +static int osc_rd_max_rpcs_in_flight(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%u\n", cli->cl_max_rpcs_in_flight); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int osc_wr_max_rpcs_in_flight(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_request_pool *pool = cli->cl_import->imp_rq_pool; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val > OSC_MAX_RIF_MAX) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(dev); + if (pool && val > cli->cl_max_rpcs_in_flight) + pool->prp_populate(pool, val-cli->cl_max_rpcs_in_flight); + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_rpcs_in_flight = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} + +static int osc_rd_max_dirty_mb(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + long val; + int mult; + + client_obd_list_lock(&cli->cl_loi_list_lock); + val = cli->cl_dirty_max; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + mult = 1 << 20; + return lprocfs_read_frac_helper(page, count, val, mult); +} + +static int osc_wr_max_dirty_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number <= 0 || + pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) || + pages_number > num_physpages / 4) /* 1/4 of RAM */ + return -ERANGE; + + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_dirty_max = (obd_count)(pages_number << PAGE_CACHE_SHIFT); + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return count; +} + +static int osc_rd_cached_mb(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int shift = 20 - PAGE_CACHE_SHIFT; + int rc; + + rc = snprintf(page, count, + "used_mb: %d\n" + "busy_cnt: %d\n", + (atomic_read(&cli->cl_lru_in_list) + + atomic_read(&cli->cl_lru_busy)) >> shift, + atomic_read(&cli->cl_lru_busy)); + + return rc; +} + +/* shrink the number of caching pages to a specific number */ +static int osc_wr_cached_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int pages_number, mult, rc; + + mult = 1 << (20 - PAGE_CACHE_SHIFT); + buffer = lprocfs_find_named_value(buffer, "used_mb:", &count); + rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); + if (rc) + return rc; + + if (pages_number < 0) + return -ERANGE; + + rc = atomic_read(&cli->cl_lru_in_list) - pages_number; + if (rc > 0) + (void)osc_lru_shrink(cli, rc); + + return count; +} + +static int osc_rd_cur_dirty_bytes(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%lu\n", cli->cl_dirty); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int osc_rd_cur_grant_bytes(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%lu\n", cli->cl_avail_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int osc_wr_cur_grant_bytes(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct client_obd *cli = &obd->u.cli; + int rc; + __u64 val; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* this is only for shrinking grant */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (val >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + return 0; + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_CHECK(obd); + if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) + rc = osc_shrink_grant_to_target(cli, val); + LPROCFS_CLIMP_EXIT(obd); + if (rc) + return rc; + return count; +} + +static int osc_rd_cur_lost_grant_bytes(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = snprintf(page, count, "%lu\n", cli->cl_lost_grant); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int osc_rd_grant_shrink_interval(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + if (obd == NULL) + return 0; + return snprintf(page, count, "%d\n", + obd->u.cli.cl_grant_shrink_interval); +} + +static int osc_wr_grant_shrink_interval(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= 0) + return -ERANGE; + + obd->u.cli.cl_grant_shrink_interval = val; + + return count; +} + +static int osc_rd_checksum(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + if (obd == NULL) + return 0; + + return snprintf(page, count, "%d\n", + obd->u.cli.cl_checksum ? 1 : 0); +} + +static int osc_wr_checksum(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + if (obd == NULL) + return 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + obd->u.cli.cl_checksum = (val ? 1 : 0); + + return count; +} + +static int osc_rd_checksum_type(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + int i, len =0; + DECLARE_CKSUM_NAME; + + if (obd == NULL) + return 0; + + for (i = 0; i < ARRAY_SIZE(cksum_name) && len < count; i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (obd->u.cli.cl_cksum_type == (1 << i)) + len += snprintf(page + len, count - len, "[%s] ", + cksum_name[i]); + else + len += snprintf(page + len, count - len, "%s ", + cksum_name[i]); + } + if (len < count) + len += sprintf(page + len, "\n"); + return len; +} + +static int osc_wd_checksum_type(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int i; + DECLARE_CKSUM_NAME; + char kernbuf[10]; + + if (obd == NULL) + return 0; + + if (count > sizeof(kernbuf) - 1) + return -EINVAL; + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + if (count > 0 && kernbuf[count - 1] == '\n') + kernbuf[count - 1] = '\0'; + else + kernbuf[count] = '\0'; + + for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { + if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) + continue; + if (!strcmp(kernbuf, cksum_name[i])) { + obd->u.cli.cl_cksum_type = 1 << i; + return count; + } + } + return -EINVAL; +} + +static int osc_rd_resend_count(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + return snprintf(page, count, "%u\n", + atomic_read(&obd->u.cli.cl_resends)); +} + +static int osc_wr_resend_count(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} + +static int osc_rd_contention_seconds(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return snprintf(page, count, "%u\n", od->od_contention_time); +} + +static int osc_wr_contention_seconds(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?: + count; +} + +static int osc_rd_lockless_truncate(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return snprintf(page, count, "%u\n", od->od_lockless_truncate); +} + +static int osc_wr_lockless_truncate(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?: + count; +} + +static int osc_rd_destroys_in_flight(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + return snprintf(page, count, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); +} + +static int lprocfs_osc_wr_max_pages_per_rpc(struct file *file, + const char *buffer, unsigned long count, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; + int chunk_mask, rc; + __u64 val; + + rc = lprocfs_write_u64_helper(buffer, count, &val); + if (rc) + return rc; + + /* if the max_pages is specified in bytes, convert to pages */ + if (val >= ONE_MB_BRW_SIZE) + val >>= PAGE_CACHE_SHIFT; + + LPROCFS_CLIMP_CHECK(dev); + + chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1); + /* max_pages_per_rpc must be chunk aligned */ + val = (val + ~chunk_mask) & chunk_mask; + if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) { + LPROCFS_CLIMP_EXIT(dev); + return -ERANGE; + } + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_max_pages_per_rpc = val; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + LPROCFS_CLIMP_EXIT(dev); + return count; +} + +static struct lprocfs_vars lprocfs_osc_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "ping", 0, lprocfs_wr_ping, 0, 0, 0222 }, + { "connect_flags", lprocfs_rd_connect_flags, 0, 0 }, + { "blocksize", lprocfs_rd_blksize, 0, 0 }, + { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, + { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, + { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, + { "filestotal", lprocfs_rd_filestotal, 0, 0 }, + { "filesfree", lprocfs_rd_filesfree, 0, 0 }, + //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, + { "ost_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, + { "ost_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, + { "active", osc_rd_active, + osc_wr_active, 0 }, + { "max_pages_per_rpc", lprocfs_obd_rd_max_pages_per_rpc, + lprocfs_osc_wr_max_pages_per_rpc, 0 }, + { "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight, + osc_wr_max_rpcs_in_flight, 0 }, + { "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 }, + { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 }, + { "osc_cached_mb", osc_rd_cached_mb, osc_wr_cached_mb, 0 }, + { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 }, + { "cur_grant_bytes", osc_rd_cur_grant_bytes, + osc_wr_cur_grant_bytes, 0 }, + { "cur_lost_grant_bytes", osc_rd_cur_lost_grant_bytes, 0, 0}, + { "grant_shrink_interval", osc_rd_grant_shrink_interval, + osc_wr_grant_shrink_interval, 0 }, + { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, + { "checksum_type", osc_rd_checksum_type, osc_wd_checksum_type, 0 }, + { "resend_count", osc_rd_resend_count, osc_wr_resend_count, 0}, + { "timeouts", lprocfs_rd_timeouts, 0, 0 }, + { "contention_seconds", osc_rd_contention_seconds, + osc_wr_contention_seconds, 0 }, + { "lockless_truncate", osc_rd_lockless_truncate, + osc_wr_lockless_truncate, 0 }, + { "import", lprocfs_rd_import, lprocfs_wr_import, 0 }, + { "state", lprocfs_rd_state, 0, 0 }, + { "pinger_recov", lprocfs_rd_pinger_recov, + lprocfs_wr_pinger_recov, 0, 0 }, + { 0 } +}; + +static struct lprocfs_vars lprocfs_osc_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; + +#define pct(a,b) (b ? a * 100 / b : 0) + +static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; + int i; + + do_gettimeofday(&now); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + seq_printf(seq, "read RPCs in flight: %d\n", + cli->cl_r_in_flight); + seq_printf(seq, "write RPCs in flight: %d\n", + cli->cl_w_in_flight); + seq_printf(seq, "pending write pages: %d\n", + atomic_read(&cli->cl_pending_w_pages)); + seq_printf(seq, "pending read pages: %d\n", + atomic_read(&cli->cl_pending_r_pages)); + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "pages per rpc rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + 1 << i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "rpcs in flight rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + i, r, pct(r, read_tot), + pct(read_cum, read_tot), w, + pct(w, write_tot), + pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + seq_printf(seq, "\n\t\t\tread\t\t\twrite\n"); + seq_printf(seq, "offset rpcs %% cum %% |"); + seq_printf(seq, " rpcs %% cum %%\n"); + + read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); + write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); + + read_cum = 0; + write_cum = 0; + for (i = 0; i < OBD_HIST_MAX; i++) { + unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; + unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; + read_cum += r; + write_cum += w; + seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", + (i == 0) ? 0 : 1 << (i - 1), + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + if (read_cum == read_tot && write_cum == write_tot) + break; + } + + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return 0; +} +#undef pct + +static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + + lprocfs_oh_clear(&cli->cl_read_rpc_hist); + lprocfs_oh_clear(&cli->cl_write_rpc_hist); + lprocfs_oh_clear(&cli->cl_read_page_hist); + lprocfs_oh_clear(&cli->cl_write_page_hist); + lprocfs_oh_clear(&cli->cl_read_offset_hist); + lprocfs_oh_clear(&cli->cl_write_offset_hist); + + return len; +} + +LPROC_SEQ_FOPS(osc_rpc_stats); + +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + do_gettimeofday(&now); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t"LPU64"\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + +int lproc_osc_attach_seqstat(struct obd_device *dev) +{ + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0644, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0644, + &osc_rpc_stats_fops, dev); + + return rc; +} + +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_osc_module_vars; + lvars->obd_vars = lprocfs_osc_obd_vars; +} +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c new file mode 100644 index 000000000000..206feadb7371 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_cache.c @@ -0,0 +1,3002 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * osc cache management. + * + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" +#include "osc_internal.h" + +static int extent_debug; /* set it to be true for more debug */ + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta); +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state); +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc); +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd); +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd); +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc); +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant); + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line); +#define osc_extent_tree_dump(lvl, obj) \ + osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) + +/** \addtogroup osc + * @{ + */ + +/* ------------------ osc extent ------------------ */ +static inline char *ext_flags(struct osc_extent *ext, char *flags) +{ + char *buf = flags; + *buf++ = ext->oe_rw ? 'r' : 'w'; + if (ext->oe_intree) + *buf++ = 'i'; + if (ext->oe_srvlock) + *buf++ = 's'; + if (ext->oe_hp) + *buf++ = 'h'; + if (ext->oe_urgent) + *buf++ = 'u'; + if (ext->oe_memalloc) + *buf++ = 'm'; + if (ext->oe_trunc_pending) + *buf++ = 't'; + if (ext->oe_fsync_wait) + *buf++ = 'Y'; + *buf = 0; + return flags; +} + +static inline char list_empty_marker(struct list_head *list) +{ + return list_empty(list) ? '-' : '+'; +} + +#define EXTSTR "[%lu -> %lu/%lu]" +#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end + +#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ + struct osc_extent *__ext = (extent); \ + const char *__str[] = OES_STRINGS; \ + char __buf[16]; \ + \ + CDEBUG(lvl, \ + "extent %p@{" EXTSTR ", " \ + "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ + /* ----- extent part 0 ----- */ \ + __ext, EXTPARA(__ext), \ + /* ----- part 1 ----- */ \ + atomic_read(&__ext->oe_refc), \ + atomic_read(&__ext->oe_users), \ + list_empty_marker(&__ext->oe_link), \ + __str[__ext->oe_state], ext_flags(__ext, __buf), \ + __ext->oe_obj, \ + /* ----- part 2 ----- */ \ + __ext->oe_grants, __ext->oe_nr_pages, \ + list_empty_marker(&__ext->oe_pages), \ + waitqueue_active(&__ext->oe_waitq) ? '+' : '-', \ + __ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner, \ + /* ----- part 4 ----- */ \ + ## __VA_ARGS__); \ +} while (0) + +#undef EASSERTF +#define EASSERTF(expr, ext, fmt, args...) do { \ + if (!(expr)) { \ + OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ + osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ + LASSERT(expr); \ + } \ +} while (0) + +#undef EASSERT +#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") + +static inline struct osc_extent *rb_extent(struct rb_node *n) +{ + if (n == NULL) + return NULL; + + return container_of(n, struct osc_extent, oe_node); +} + +static inline struct osc_extent *next_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_next(&ext->oe_node)); +} + +static inline struct osc_extent *prev_extent(struct osc_extent *ext) +{ + if (ext == NULL) + return NULL; + + LASSERT(ext->oe_intree); + return rb_extent(rb_prev(&ext->oe_node)); +} + +static inline struct osc_extent *first_extent(struct osc_object *obj) +{ + return rb_extent(rb_first(&obj->oo_root)); +} + +/* object must be locked by caller. */ +static int osc_extent_sanity_check0(struct osc_extent *ext, + const char *func, const int line) +{ + struct osc_object *obj = ext->oe_obj; + struct osc_async_page *oap; + int page_count; + int rc = 0; + + if (!osc_object_is_locked(obj)) + GOTO(out, rc = 9); + + if (ext->oe_state >= OES_STATE_MAX) + GOTO(out, rc = 10); + + if (atomic_read(&ext->oe_refc) <= 0) + GOTO(out, rc = 20); + + if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) + GOTO(out, rc = 30); + + switch (ext->oe_state) { + case OES_INV: + if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) + GOTO(out, rc = 35); + GOTO(out, rc = 0); + break; + case OES_ACTIVE: + if (atomic_read(&ext->oe_users) == 0) + GOTO(out, rc = 40); + if (ext->oe_hp) + GOTO(out, rc = 50); + if (ext->oe_fsync_wait && !ext->oe_urgent) + GOTO(out, rc = 55); + break; + case OES_CACHE: + if (ext->oe_grants == 0) + GOTO(out, rc = 60); + if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) + GOTO(out, rc = 65); + default: + if (atomic_read(&ext->oe_users) > 0) + GOTO(out, rc = 70); + } + + if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) + GOTO(out, rc = 80); + + if (ext->oe_osclock == NULL && ext->oe_grants > 0) + GOTO(out, rc = 90); + + if (ext->oe_osclock) { + struct cl_lock_descr *descr; + descr = &ext->oe_osclock->cll_descr; + if (!(descr->cld_start <= ext->oe_start && + descr->cld_end >= ext->oe_max_end)) + GOTO(out, rc = 100); + } + + if (ext->oe_nr_pages > ext->oe_mppr) + GOTO(out, rc = 105); + + /* Do not verify page list if extent is in RPC. This is because an + * in-RPC extent is supposed to be exclusively accessible w/o lock. */ + if (ext->oe_state > OES_CACHE) + GOTO(out, rc = 0); + + if (!extent_debug) + GOTO(out, rc = 0); + + page_count = 0; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + pgoff_t index = oap2cl_page(oap)->cp_index; + ++page_count; + if (index > ext->oe_end || index < ext->oe_start) + GOTO(out, rc = 110); + } + if (page_count != ext->oe_nr_pages) + GOTO(out, rc = 120); + +out: + if (rc != 0) + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s:%d sanity check %p failed with rc = %d\n", + func, line, ext, rc); + return rc; +} + +#define sanity_check_nolock(ext) \ + osc_extent_sanity_check0(ext, __func__, __LINE__) + +#define sanity_check(ext) ({ \ + int __res; \ + osc_object_lock((ext)->oe_obj); \ + __res = sanity_check_nolock(ext); \ + osc_object_unlock((ext)->oe_obj); \ + __res; \ +}) + + +/** + * sanity check - to make sure there is no overlapped extent in the tree. + */ +static int osc_extent_is_overlapped(struct osc_object *obj, + struct osc_extent *ext) +{ + struct osc_extent *tmp; + + LASSERT(osc_object_is_locked(obj)); + + if (!extent_debug) + return 0; + + for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { + if (tmp == ext) + continue; + if (tmp->oe_end >= ext->oe_start && + tmp->oe_start <= ext->oe_end) + return 1; + } + return 0; +} + +static void osc_extent_state_set(struct osc_extent *ext, int state) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(state >= OES_INV && state < OES_STATE_MAX); + + /* Never try to sanity check a state changing extent :-) */ + /* LASSERT(sanity_check_nolock(ext) == 0); */ + + /* TODO: validate the state machine */ + ext->oe_state = state; + wake_up_all(&ext->oe_waitq); +} + +static struct osc_extent *osc_extent_alloc(struct osc_object *obj) +{ + struct osc_extent *ext; + + OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS); + if (ext == NULL) + return NULL; + + RB_CLEAR_NODE(&ext->oe_node); + ext->oe_obj = obj; + atomic_set(&ext->oe_refc, 1); + atomic_set(&ext->oe_users, 0); + INIT_LIST_HEAD(&ext->oe_link); + ext->oe_state = OES_INV; + INIT_LIST_HEAD(&ext->oe_pages); + init_waitqueue_head(&ext->oe_waitq); + ext->oe_osclock = NULL; + + return ext; +} + +static void osc_extent_free(struct osc_extent *ext) +{ + OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); +} + +static struct osc_extent *osc_extent_get(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) >= 0); + atomic_inc(&ext->oe_refc); + return ext; +} + +static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 0); + if (atomic_dec_and_test(&ext->oe_refc)) { + LASSERT(list_empty(&ext->oe_link)); + LASSERT(atomic_read(&ext->oe_users) == 0); + LASSERT(ext->oe_state == OES_INV); + LASSERT(!ext->oe_intree); + + if (ext->oe_osclock) { + cl_lock_put(env, ext->oe_osclock); + ext->oe_osclock = NULL; + } + osc_extent_free(ext); + } +} + +/** + * osc_extent_put_trust() is a special version of osc_extent_put() when + * it's known that the caller is not the last user. This is to address the + * problem of lacking of lu_env ;-). + */ +static void osc_extent_put_trust(struct osc_extent *ext) +{ + LASSERT(atomic_read(&ext->oe_refc) > 1); + LASSERT(osc_object_is_locked(ext->oe_obj)); + atomic_dec(&ext->oe_refc); +} + +/** + * Return the extent which includes pgoff @index, or return the greatest + * previous extent in the tree. + */ +static struct osc_extent *osc_extent_search(struct osc_object *obj, + pgoff_t index) +{ + struct rb_node *n = obj->oo_root.rb_node; + struct osc_extent *tmp, *p = NULL; + + LASSERT(osc_object_is_locked(obj)); + while (n != NULL) { + tmp = rb_extent(n); + if (index < tmp->oe_start) { + n = n->rb_left; + } else if (index > tmp->oe_end) { + p = rb_extent(n); + n = n->rb_right; + } else { + return tmp; + } + } + return p; +} + +/* + * Return the extent covering @index, otherwise return NULL. + * caller must have held object lock. + */ +static struct osc_extent *osc_extent_lookup(struct osc_object *obj, + pgoff_t index) +{ + struct osc_extent *ext; + + ext = osc_extent_search(obj, index); + if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) + return osc_extent_get(ext); + return NULL; +} + +/* caller must have held object lock. */ +static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) +{ + struct rb_node **n = &obj->oo_root.rb_node; + struct rb_node *parent = NULL; + struct osc_extent *tmp; + + LASSERT(ext->oe_intree == 0); + LASSERT(ext->oe_obj == obj); + LASSERT(osc_object_is_locked(obj)); + while (*n != NULL) { + tmp = rb_extent(*n); + parent = *n; + + if (ext->oe_end < tmp->oe_start) + n = &(*n)->rb_left; + else if (ext->oe_start > tmp->oe_end) + n = &(*n)->rb_right; + else + EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); + } + rb_link_node(&ext->oe_node, parent, n); + rb_insert_color(&ext->oe_node, &obj->oo_root); + osc_extent_get(ext); + ext->oe_intree = 1; +} + +/* caller must have held object lock. */ +static void osc_extent_erase(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + LASSERT(osc_object_is_locked(obj)); + if (ext->oe_intree) { + rb_erase(&ext->oe_node, &obj->oo_root); + ext->oe_intree = 0; + /* rbtree held a refcount */ + osc_extent_put_trust(ext); + } +} + +static struct osc_extent *osc_extent_hold(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + LASSERT(osc_object_is_locked(obj)); + LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); + if (ext->oe_state == OES_CACHE) { + osc_extent_state_set(ext, OES_ACTIVE); + osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); + } + atomic_inc(&ext->oe_users); + list_del_init(&ext->oe_link); + return osc_extent_get(ext); +} + +static void __osc_extent_remove(struct osc_extent *ext) +{ + LASSERT(osc_object_is_locked(ext->oe_obj)); + LASSERT(list_empty(&ext->oe_pages)); + osc_extent_erase(ext); + list_del_init(&ext->oe_link); + osc_extent_state_set(ext, OES_INV); + OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); +} + +static void osc_extent_remove(struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + + osc_object_lock(obj); + __osc_extent_remove(ext); + osc_object_unlock(obj); +} + +/** + * This function is used to merge extents to get better performance. It checks + * if @cur and @victim are contiguous at chunk level. + */ +static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, + struct osc_extent *victim) +{ + struct osc_object *obj = cur->oe_obj; + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; + + LASSERT(cur->oe_state == OES_CACHE); + LASSERT(osc_object_is_locked(obj)); + if (victim == NULL) + return -EINVAL; + + if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) + return -EBUSY; + + if (cur->oe_max_end != victim->oe_max_end) + return -ERANGE; + + LASSERT(cur->oe_osclock == victim->oe_osclock); + ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_start = cur->oe_start >> ppc_bits; + chunk_end = cur->oe_end >> ppc_bits; + if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && + chunk_end + 1 != victim->oe_start >> ppc_bits) + return -ERANGE; + + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); + + cur->oe_start = min(cur->oe_start, victim->oe_start); + cur->oe_end = max(cur->oe_end, victim->oe_end); + cur->oe_grants += victim->oe_grants; + cur->oe_nr_pages += victim->oe_nr_pages; + /* only the following bits are needed to merge */ + cur->oe_urgent |= victim->oe_urgent; + cur->oe_memalloc |= victim->oe_memalloc; + list_splice_init(&victim->oe_pages, &cur->oe_pages); + list_del_init(&victim->oe_link); + victim->oe_nr_pages = 0; + + osc_extent_get(victim); + __osc_extent_remove(victim); + osc_extent_put(env, victim); + + OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); + return 0; +} + +/** + * Drop user count of osc_extent, and unplug IO asynchronously. + */ +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) +{ + struct osc_object *obj = ext->oe_obj; + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&ext->oe_users) > 0); + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_grants > 0); + + if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { + LASSERT(ext->oe_state == OES_ACTIVE); + if (ext->oe_trunc_pending) { + /* a truncate process is waiting for this extent. + * This may happen due to a race, check + * osc_cache_truncate_start(). */ + osc_extent_state_set(ext, OES_TRUNC); + ext->oe_trunc_pending = 0; + } else { + osc_extent_state_set(ext, OES_CACHE); + osc_update_pending(obj, OBD_BRW_WRITE, + ext->oe_nr_pages); + + /* try to merge the previous and next extent. */ + osc_extent_merge(env, ext, prev_extent(ext)); + osc_extent_merge(env, ext, next_extent(ext)); + + if (ext->oe_urgent) + list_move_tail(&ext->oe_link, + &obj->oo_urgent_exts); + } + osc_object_unlock(obj); + + osc_io_unplug_async(env, osc_cli(obj), obj); + } + osc_extent_put(env, ext); + RETURN(rc); +} + +static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) +{ + return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); +} + +/** + * Find or create an extent which includes @index, core function to manage + * extent tree. + */ +struct osc_extent *osc_extent_find(const struct lu_env *env, + struct osc_object *obj, pgoff_t index, + int *grants) + +{ + struct client_obd *cli = osc_cli(obj); + struct cl_lock *lock; + struct osc_extent *cur; + struct osc_extent *ext; + struct osc_extent *conflict = NULL; + struct osc_extent *found = NULL; + pgoff_t chunk; + pgoff_t max_end; + int max_pages; /* max_pages_per_rpc */ + int chunksize; + int ppc_bits; /* pages per chunk bits */ + int chunk_mask; + int rc; + ENTRY; + + cur = osc_extent_alloc(obj); + if (cur == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0); + LASSERT(lock != NULL); + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + + LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT); + ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + chunk_mask = ~((1 << ppc_bits) - 1); + chunksize = 1 << cli->cl_chunkbits; + chunk = index >> ppc_bits; + + /* align end to rpc edge, rpc size may not be a power 2 integer. */ + max_pages = cli->cl_max_pages_per_rpc; + LASSERT((max_pages & ~chunk_mask) == 0); + max_end = index - (index % max_pages) + max_pages - 1; + max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end); + + /* initialize new extent by parameters so far */ + cur->oe_max_end = max_end; + cur->oe_start = index & chunk_mask; + cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; + if (cur->oe_start < lock->cll_descr.cld_start) + cur->oe_start = lock->cll_descr.cld_start; + if (cur->oe_end > max_end) + cur->oe_end = max_end; + cur->oe_osclock = lock; + cur->oe_grants = 0; + cur->oe_mppr = max_pages; + + /* grants has been allocated by caller */ + LASSERTF(*grants >= chunksize + cli->cl_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); + LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur)); + +restart: + osc_object_lock(obj); + ext = osc_extent_search(obj, cur->oe_start); + if (ext == NULL) + ext = first_extent(obj); + while (ext != NULL) { + loff_t ext_chk_start = ext->oe_start >> ppc_bits; + loff_t ext_chk_end = ext->oe_end >> ppc_bits; + + LASSERT(sanity_check_nolock(ext) == 0); + if (chunk > ext_chk_end + 1) + break; + + /* if covering by different locks, no chance to match */ + if (lock != ext->oe_osclock) { + EASSERTF(!overlapped(ext, cur), ext, + EXTSTR, EXTPARA(cur)); + + ext = next_extent(ext); + continue; + } + + /* discontiguous chunks? */ + if (chunk + 1 < ext_chk_start) { + ext = next_extent(ext); + continue; + } + + /* ok, from now on, ext and cur have these attrs: + * 1. covered by the same lock + * 2. contiguous at chunk level or overlapping. */ + + if (overlapped(ext, cur)) { + /* cur is the minimum unit, so overlapping means + * full contain. */ + EASSERTF((ext->oe_start <= cur->oe_start && + ext->oe_end >= cur->oe_end), + ext, EXTSTR, EXTPARA(cur)); + + if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { + /* for simplicity, we wait for this extent to + * finish before going forward. */ + conflict = osc_extent_get(ext); + break; + } + + found = osc_extent_hold(ext); + break; + } + + /* non-overlapped extent */ + if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { + /* we can't do anything for a non OES_CACHE extent, or + * if there is someone waiting for this extent to be + * flushed, try next one. */ + ext = next_extent(ext); + continue; + } + + /* check if they belong to the same rpc slot before trying to + * merge. the extents are not overlapped and contiguous at + * chunk level to get here. */ + if (ext->oe_max_end != max_end) { + /* if they don't belong to the same RPC slot or + * max_pages_per_rpc has ever changed, do not merge. */ + ext = next_extent(ext); + continue; + } + + /* it's required that an extent must be contiguous at chunk + * level so that we know the whole extent is covered by grant + * (the pages in the extent are NOT required to be contiguous). + * Otherwise, it will be too much difficult to know which + * chunks have grants allocated. */ + + /* try to do front merge - extend ext's start */ + if (chunk + 1 == ext_chk_start) { + /* ext must be chunk size aligned */ + EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); + + /* pull ext's start back to cover cur */ + ext->oe_start = cur->oe_start; + ext->oe_grants += chunksize; + *grants -= chunksize; + + found = osc_extent_hold(ext); + } else if (chunk == ext_chk_end + 1) { + /* rear merge */ + ext->oe_end = cur->oe_end; + ext->oe_grants += chunksize; + *grants -= chunksize; + + /* try to merge with the next one because we just fill + * in a gap */ + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + /* we can save extent tax from next extent */ + *grants += cli->cl_extent_tax; + + found = osc_extent_hold(ext); + } + if (found != NULL) + break; + + ext = next_extent(ext); + } + + osc_extent_tree_dump(D_CACHE, obj); + if (found != NULL) { + LASSERT(conflict == NULL); + if (!IS_ERR(found)) { + LASSERT(found->oe_osclock == cur->oe_osclock); + OSC_EXTENT_DUMP(D_CACHE, found, + "found caching ext for %lu.\n", index); + } + } else if (conflict == NULL) { + /* create a new extent */ + EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); + cur->oe_grants = chunksize + cli->cl_extent_tax; + *grants -= cur->oe_grants; + LASSERT(*grants >= 0); + + cur->oe_state = OES_CACHE; + found = osc_extent_hold(cur); + osc_extent_insert(obj, cur); + OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", + index, lock->cll_descr.cld_end); + } + osc_object_unlock(obj); + + if (conflict != NULL) { + LASSERT(found == NULL); + + /* waiting for IO to finish. Please notice that it's impossible + * to be an OES_TRUNC extent. */ + rc = osc_extent_wait(env, conflict, OES_INV); + osc_extent_put(env, conflict); + conflict = NULL; + if (rc < 0) + GOTO(out, found = ERR_PTR(rc)); + + goto restart; + } + EXIT; + +out: + osc_extent_put(env, cur); + LASSERT(*grants >= 0); + return found; +} + +/** + * Called when IO is finished to an extent. + */ +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc) +{ + struct client_obd *cli = osc_cli(ext->oe_obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int nr_pages = ext->oe_nr_pages; + int lost_grant = 0; + int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; + __u64 last_off = 0; + int last_count = -1; + ENTRY; + + OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); + + ext->oe_rc = rc ?: ext->oe_nr_pages; + EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + list_del_init(&oap->oap_rpc_item); + list_del_init(&oap->oap_pending_item); + if (last_off <= oap->oap_obj_off) { + last_off = oap->oap_obj_off; + last_count = oap->oap_count; + } + + --ext->oe_nr_pages; + osc_ap_completion(env, cli, oap, sent, rc); + } + EASSERT(ext->oe_nr_pages == 0, ext); + + if (!sent) { + lost_grant = ext->oe_grants; + } else if (blocksize < PAGE_CACHE_SIZE && + last_count != PAGE_CACHE_SIZE) { + /* For short writes we shouldn't count parts of pages that + * span a whole chunk on the OST side, or our accounting goes + * wrong. Should match the code in filter_grant_check. */ + int offset = oap->oap_page_off & ~CFS_PAGE_MASK; + int count = oap->oap_count + (offset & (blocksize - 1)); + int end = (offset + oap->oap_count) & (blocksize - 1); + if (end) + count += blocksize - end; + + lost_grant = PAGE_CACHE_SIZE - count; + } + if (ext->oe_grants > 0) + osc_free_grant(cli, nr_pages, lost_grant); + + osc_extent_remove(ext); + /* put the refcount for RPC */ + osc_extent_put(env, ext); + RETURN(0); +} + +static int extent_wait_cb(struct osc_extent *ext, int state) +{ + int ret; + + osc_object_lock(ext->oe_obj); + ret = ext->oe_state == state; + osc_object_unlock(ext->oe_obj); + + return ret; +} + +/** + * Wait for the extent's state to become @state. + */ +static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, + int state) +{ + struct osc_object *obj = ext->oe_obj; + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, + LWI_ON_SIGNAL_NOOP, NULL); + int rc = 0; + ENTRY; + + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + /* `Kick' this extent only if the caller is waiting for it to be + * written out. */ + if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) { + if (ext->oe_state == OES_ACTIVE) { + ext->oe_urgent = 1; + } else if (ext->oe_state == OES_CACHE) { + ext->oe_urgent = 1; + osc_extent_hold(ext); + rc = 1; + } + } + osc_object_unlock(obj); + if (rc == 1) + osc_extent_release(env, ext); + + /* wait for the extent until its state becomes @state */ + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); + if (rc == -ETIMEDOUT) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "%s: wait ext to %d timedout, recovery in progress?\n", + osc_export(obj)->exp_obd->obd_name, state); + + lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), + &lwi); + } + if (rc == 0 && ext->oe_rc < 0) + rc = ext->oe_rc; + RETURN(rc); +} + +/** + * Discard pages with index greater than @size. If @ext is overlapped with + * @size, then partial truncate happens. + */ +static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, + bool partial) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_async_page *oap; + struct osc_async_page *tmp; + int pages_in_chunk = 0; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + __u64 trunc_chunk = trunc_index >> ppc_bits; + int grants = 0; + int nr_pages = 0; + int rc = 0; + ENTRY; + + LASSERT(sanity_check(ext) == 0); + LASSERT(ext->oe_state == OES_TRUNC); + LASSERT(!ext->oe_urgent); + + /* Request new lu_env. + * We can't use that env from osc_cache_truncate_start() because + * it's from lov_io_sub and not fully initialized. */ + env = cl_env_nested_get(&nest); + io = &osc_env_info(env)->oti_io; + io->ci_obj = cl_object_top(osc2cl(obj)); + rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (rc < 0) + GOTO(out, rc); + + /* discard all pages with index greater then trunc_index */ + list_for_each_entry_safe(oap, tmp, &ext->oe_pages, + oap_pending_item) { + struct cl_page *sub = oap2cl_page(oap); + struct cl_page *page = cl_page_top(sub); + + LASSERT(list_empty(&oap->oap_rpc_item)); + + /* only discard the pages with their index greater than + * trunc_index, and ... */ + if (sub->cp_index < trunc_index || + (sub->cp_index == trunc_index && partial)) { + /* accounting how many pages remaining in the chunk + * so that we can calculate grants correctly. */ + if (sub->cp_index >> ppc_bits == trunc_chunk) + ++pages_in_chunk; + continue; + } + + list_del_init(&oap->oap_pending_item); + + cl_page_get(page); + lu_ref_add(&page->cp_reference, "truncate", current); + + if (cl_page_own(env, io, page) == 0) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + LASSERT(0); + } + + lu_ref_del(&page->cp_reference, "truncate", current); + cl_page_put(env, page); + + --ext->oe_nr_pages; + ++nr_pages; + } + EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, + ext->oe_nr_pages == 0), + ext, "trunc_index %lu, partial %d\n", trunc_index, partial); + + osc_object_lock(obj); + if (ext->oe_nr_pages == 0) { + LASSERT(pages_in_chunk == 0); + grants = ext->oe_grants; + ext->oe_grants = 0; + } else { /* calculate how many grants we can free */ + int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; + pgoff_t last_index; + + + /* if there is no pages in this chunk, we can also free grants + * for the last chunk */ + if (pages_in_chunk == 0) { + /* if this is the 1st chunk and no pages in this chunk, + * ext->oe_nr_pages must be zero, so we should be in + * the other if-clause. */ + LASSERT(trunc_chunk > 0); + --trunc_chunk; + ++chunks; + } + + /* this is what we can free from this extent */ + grants = chunks << cli->cl_chunkbits; + ext->oe_grants -= grants; + last_index = ((trunc_chunk + 1) << ppc_bits) - 1; + ext->oe_end = min(last_index, ext->oe_max_end); + LASSERT(ext->oe_end >= ext->oe_start); + LASSERT(ext->oe_grants > 0); + } + osc_object_unlock(obj); + + if (grants > 0 || nr_pages > 0) + osc_free_grant(cli, nr_pages, grants); + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + RETURN(rc); +} + +/** + * This function is used to make the extent prepared for transfer. + * A race with flusing page - ll_writepage() has to be handled cautiously. + */ +static int osc_extent_make_ready(const struct lu_env *env, + struct osc_extent *ext) +{ + struct osc_async_page *oap; + struct osc_async_page *last = NULL; + struct osc_object *obj = ext->oe_obj; + int page_count = 0; + int rc; + ENTRY; + + /* we're going to grab page lock, so object lock must not be taken. */ + LASSERT(sanity_check(ext) == 0); + /* in locking state, any process should not touch this extent. */ + EASSERT(ext->oe_state == OES_LOCKING, ext); + EASSERT(ext->oe_owner != NULL, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); + + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + if (last == NULL || last->oap_obj_off < oap->oap_obj_off) + last = oap; + + /* checking ASYNC_READY is race safe */ + if ((oap->oap_async_flags & ASYNC_READY) != 0) + continue; + + rc = osc_make_ready(env, oap, OBD_BRW_WRITE); + switch (rc) { + case 0: + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY; + spin_unlock(&oap->oap_lock); + break; + case -EALREADY: + LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); + break; + default: + LASSERTF(0, "unknown return code: %d\n", rc); + } + } + + LASSERT(page_count == ext->oe_nr_pages); + LASSERT(last != NULL); + /* the last page is the only one we need to refresh its count by + * the size of file. */ + if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { + last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); + LASSERT(last->oap_count > 0); + LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); + last->oap_async_flags |= ASYNC_COUNT_STABLE; + } + + /* for the rest of pages, we don't need to call osf_refresh_count() + * because it's known they are not the last page */ + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { + oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + } + } + + osc_object_lock(obj); + osc_extent_state_set(ext, OES_RPC); + osc_object_unlock(obj); + /* get a refcount for RPC. */ + osc_extent_get(ext); + + RETURN(0); +} + +/** + * Quick and simple version of osc_extent_find(). This function is frequently + * called to expand the extent for the same IO. To expand the extent, the + * page index must be in the same or next chunk of ext->oe_end. + */ +static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) +{ + struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *next; + int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; + pgoff_t chunk = index >> ppc_bits; + pgoff_t end_chunk; + pgoff_t end_index; + int chunksize = 1 << cli->cl_chunkbits; + int rc = 0; + ENTRY; + + LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); + osc_object_lock(obj); + LASSERT(sanity_check_nolock(ext) == 0); + end_chunk = ext->oe_end >> ppc_bits; + if (chunk > end_chunk + 1) + GOTO(out, rc = -ERANGE); + + if (end_chunk >= chunk) + GOTO(out, rc = 0); + + LASSERT(end_chunk + 1 == chunk); + /* try to expand this extent to cover @index */ + end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + + next = next_extent(ext); + if (next != NULL && next->oe_start <= end_index) + /* complex mode - overlapped with the next extent, + * this case will be handled by osc_extent_find() */ + GOTO(out, rc = -EAGAIN); + + ext->oe_end = end_index; + ext->oe_grants += chunksize; + *grants -= chunksize; + LASSERT(*grants >= 0); + EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, + "overlapped after expanding for %lu.\n", index); + EXIT; + +out: + osc_object_unlock(obj); + RETURN(rc); +} + +static void osc_extent_tree_dump0(int level, struct osc_object *obj, + const char *func, int line) +{ + struct osc_extent *ext; + int cnt; + + CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", + obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); + + /* osc_object_lock(obj); */ + cnt = 1; + for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) + OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); + + cnt = 1; + list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) + OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); + /* osc_object_unlock(obj); */ +} + +/* ------------------ osc extent end ------------------ */ + +static inline int osc_is_ready(struct osc_object *osc) +{ + return !list_empty(&osc->oo_ready_item) || + !list_empty(&osc->oo_hp_ready_item); +} + +#define OSC_IO_DEBUG(OSC, STR, args...) \ + CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ + (OSC), osc_is_ready(OSC), \ + list_empty_marker(&(OSC)->oo_hp_ready_item), \ + list_empty_marker(&(OSC)->oo_ready_item), \ + atomic_read(&(OSC)->oo_nr_writes), \ + list_empty_marker(&(OSC)->oo_hp_exts), \ + list_empty_marker(&(OSC)->oo_urgent_exts), \ + atomic_read(&(OSC)->oo_nr_reads), \ + list_empty_marker(&(OSC)->oo_reading_exts), \ + ##args) + +static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, + int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + if (result == 0) + opg->ops_submit_time = cfs_time_current(); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, + struct osc_async_page *oap, int cmd) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = oap2cl_page(oap); + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + int result; + loff_t kms; + + /* readpage queues with _COUNT_STABLE, shouldn't get here. */ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(opg != NULL); + obj = opg->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, page->cp_index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, page->cp_index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % PAGE_CACHE_SIZE; + else + return PAGE_CACHE_SIZE; +} + +static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, + int cmd, int rc) +{ + struct osc_page *opg = oap2osc_page(oap); + struct cl_page *page = cl_page_top(oap2cl_page(oap)); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + enum cl_req_type crt; + int srvlock; + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); + LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); + LASSERT(opg->ops_transfer_pinned); + + /* + * page->cp_req can be NULL if io submission failed before + * cl_req was allocated. + */ + if (page->cp_req != NULL) + cl_req_page_done(env, page); + LASSERT(page->cp_req == NULL); + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + spin_lock(&obj->oo_seatbelt); + LASSERT(opg->ops_submitter != NULL); + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + spin_unlock(&obj->oo_seatbelt); + + opg->ops_submit_time = 0; + srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; + + /* statistic */ + if (rc == 0 && srvlock) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + int bytes = oap->oap_count; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + + cl_page_completion(env, page, crt, rc); + + RETURN(0); +} + +#define OSC_DUMP_GRANT(cli, fmt, args...) do { \ + struct client_obd *__tmp = (cli); \ + CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d " \ + "unstable_pages: %d/%d dropped: %ld avail: %ld, " \ + "reserved: %ld, flight: %d } " fmt, \ + __tmp->cl_import->imp_obd->obd_name, \ + __tmp->cl_dirty, __tmp->cl_dirty_max, \ + atomic_read(&obd_dirty_pages), obd_max_dirty_pages, \ + atomic_read(&obd_unstable_pages), obd_max_dirty_pages, \ + __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args); \ +} while (0) + +/* caller must hold loi_list_lock */ +static void osc_consume_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock)); + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); + atomic_inc(&obd_dirty_pages); + cli->cl_dirty += PAGE_CACHE_SIZE; + pga->flag |= OBD_BRW_FROM_GRANT; + CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", + PAGE_CACHE_SIZE, pga, pga->pg); + osc_update_next_shrink(cli); +} + +/* the companion to osc_consume_write_grant, called when a brw has completed. + * must be called with the loi lock held. */ +static void osc_release_write_grant(struct client_obd *cli, + struct brw_page *pga) +{ + ENTRY; + + LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock)); + if (!(pga->flag & OBD_BRW_FROM_GRANT)) { + EXIT; + return; + } + + pga->flag &= ~OBD_BRW_FROM_GRANT; + atomic_dec(&obd_dirty_pages); + cli->cl_dirty -= PAGE_CACHE_SIZE; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit -= PAGE_CACHE_SIZE; + } + EXIT; +} + +/** + * To avoid sleeping with object lock held, it's good for us allocate enough + * grants before entering into critical section. + * + * client_obd_list_lock held by caller + */ +static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) +{ + int rc = -EDQUOT; + + if (cli->cl_avail_grant >= bytes) { + cli->cl_avail_grant -= bytes; + cli->cl_reserved_grant += bytes; + rc = 0; + } + return rc; +} + +static void __osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + /* it's quite normal for us to get more grant than reserved. + * Thinking about a case that two extents merged by adding a new + * chunk, we can save one extent tax. If extent tax is greater than + * one chunk, we can save more grant by adding a new chunk */ + cli->cl_reserved_grant -= reserved; + if (unused > reserved) { + cli->cl_avail_grant += reserved; + cli->cl_lost_grant += unused - reserved; + } else { + cli->cl_avail_grant += unused; + } +} + +void osc_unreserve_grant(struct client_obd *cli, + unsigned int reserved, unsigned int unused) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + __osc_unreserve_grant(cli, reserved, unused); + if (unused > 0) + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Free grant after IO is finished or canceled. + * + * @lost_grant is used to remember how many grants we have allocated but not + * used, we should return these grants to OST. There're two cases where grants + * can be lost: + * 1. truncate; + * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was + * written. In this case OST may use less chunks to serve this partial + * write. OSTs don't actually know the page size on the client side. so + * clients have to calculate lost grant by the blocksize on the OST. + * See filter_grant_check() for details. + */ +static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, + unsigned int lost_grant) +{ + int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + client_obd_list_lock(&cli->cl_loi_list_lock); + atomic_sub(nr_pages, &obd_dirty_pages); + cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; + cli->cl_lost_grant += lost_grant; + if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { + /* borrow some grant from truncate to avoid the case that + * truncate uses up all avail grant */ + cli->cl_lost_grant -= grant; + cli->cl_avail_grant += grant; + } + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", + lost_grant, cli->cl_lost_grant, + cli->cl_avail_grant, cli->cl_dirty); +} + +/** + * The companion to osc_enter_cache(), called when @oap is no longer part of + * the dirty accounting due to error. + */ +static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_release_write_grant(cli, &oap->oap_brw_page); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. + */ +static int osc_enter_cache_try(struct client_obd *cli, + struct osc_async_page *oap, + int bytes, int transient) +{ + int rc; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + rc = osc_reserve_grant(cli, bytes); + if (rc < 0) + return 0; + + if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && + atomic_read(&obd_unstable_pages) + 1 + + atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit += PAGE_CACHE_SIZE; + atomic_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + rc = 1; + } else { + __osc_unreserve_grant(cli, bytes, bytes); + rc = 0; + } + return rc; +} + +static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) +{ + int rc; + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = list_empty(&ocw->ocw_entry); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +/** + * The main entry to reserve dirty page accounting. Usually the grant reserved + * in this function will be freed in bulk in osc_free_grant() unless it fails + * to add osc cache, in that case, it will be freed in osc_exit_cache(). + * + * The process will be put into sleep if it's already run out of grant. + */ +static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int bytes) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + struct osc_cache_waiter ocw; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + int rc = -EDQUOT; + ENTRY; + + OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* force the caller to try sync io. this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || + cli->cl_dirty_max < PAGE_CACHE_SIZE || + cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) + GOTO(out, rc = -EDQUOT); + + /* Hopefully normal case - cache space and write credits available */ + if (osc_enter_cache_try(cli, oap, bytes, 0)) + GOTO(out, rc = 0); + + /* We can get here for two reasons: too many dirty pages in cache, or + * run out of grants. In both cases we should write dirty pages out. + * Adding a cache waiter will trigger urgent write-out no matter what + * RPC size will be. + * The exiting condition is no avail grants and no dirty pages caching, + * that really means there is no space on the OST. */ + init_waitqueue_head(&ocw.ocw_waitq); + ocw.ocw_oap = oap; + ocw.ocw_grant = bytes; + while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { + list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); + ocw.ocw_rc = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug_async(env, cli, NULL); + + CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", + cli->cl_import->imp_obd->obd_name, &ocw, oap); + + rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); + + client_obd_list_lock(&cli->cl_loi_list_lock); + + /* l_wait_event is interrupted by signal */ + if (rc < 0) { + list_del_init(&ocw.ocw_entry); + GOTO(out, rc); + } + + LASSERT(list_empty(&ocw.ocw_entry)); + rc = ocw.ocw_rc; + + if (rc != -EDQUOT) + GOTO(out, rc); + if (osc_enter_cache_try(cli, oap, bytes, 0)) + GOTO(out, rc = 0); + } + EXIT; +out: + client_obd_list_unlock(&cli->cl_loi_list_lock); + OSC_DUMP_GRANT(cli, "returned %d.\n", rc); + RETURN(rc); +} + +/* caller must hold loi_list_lock */ +void osc_wake_cache_waiters(struct client_obd *cli) +{ + struct list_head *l, *tmp; + struct osc_cache_waiter *ocw; + + ENTRY; + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { + ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); + list_del_init(&ocw->ocw_entry); + + ocw->ocw_rc = -EDQUOT; + /* we can't dirty more */ + if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max || + atomic_read(&obd_unstable_pages) + 1 + + atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) { + CDEBUG(D_CACHE, "no dirty room: dirty: %ld " + "osc max %ld, sys max %d\n", cli->cl_dirty, + cli->cl_dirty_max, obd_max_dirty_pages); + goto wakeup; + } + + ocw->ocw_rc = 0; + if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) + ocw->ocw_rc = -EDQUOT; + +wakeup: + CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", + ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); + + wake_up(&ocw->ocw_waitq); + } + + EXIT; +} + +static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) +{ + int hprpc = !!list_empty(&osc->oo_hp_exts); + return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; +} + +/* This maintains the lists of pending pages to read/write for a given object + * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() + * to quickly find objects that are ready to send an RPC. */ +static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, + int cmd) +{ + int invalid_import = 0; + ENTRY; + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) + invalid_import = 1; + + if (cmd & OBD_BRW_WRITE) { + if (atomic_read(&osc->oo_nr_writes) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_hp_exts)) { + CDEBUG(D_CACHE, "high prio request forcing RPC\n"); + RETURN(1); + } + if (!list_empty(&osc->oo_urgent_exts)) { + CDEBUG(D_CACHE, "urgent request forcing RPC\n"); + RETURN(1); + } + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coalesce with what's waiting.. */ + if (!list_empty(&cli->cl_cache_waiters)) { + CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); + RETURN(1); + } + if (atomic_read(&osc->oo_nr_writes) >= + cli->cl_max_pages_per_rpc) + RETURN(1); + } else { + if (atomic_read(&osc->oo_nr_reads) == 0) + RETURN(0); + if (invalid_import) { + CDEBUG(D_CACHE, "invalid import forcing RPC\n"); + RETURN(1); + } + /* all read are urgent. */ + if (!list_empty(&osc->oo_reading_exts)) + RETURN(1); + } + + RETURN(0); +} + +static void osc_update_pending(struct osc_object *obj, int cmd, int delta) +{ + struct client_obd *cli = osc_cli(obj); + if (cmd & OBD_BRW_WRITE) { + atomic_add(delta, &obj->oo_nr_writes); + atomic_add(delta, &cli->cl_pending_w_pages); + LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); + } else { + atomic_add(delta, &obj->oo_nr_reads); + atomic_add(delta, &cli->cl_pending_r_pages); + LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); + } + OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); +} + +static int osc_makes_hprpc(struct osc_object *obj) +{ + return !list_empty(&obj->oo_hp_exts); +} + +static void on_list(struct list_head *item, struct list_head *list, int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + if (osc_makes_hprpc(osc)) { + /* HP rpc */ + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); + } else { + on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); + on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, + osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || + osc_makes_rpc(cli, osc, OBD_BRW_READ)); + } + + on_list(&osc->oo_write_item, &cli->cl_loi_write_list, + atomic_read(&osc->oo_nr_writes) > 0); + + on_list(&osc->oo_read_item, &cli->cl_loi_read_list, + atomic_read(&osc->oo_nr_reads) > 0); + + return osc_is_ready(osc); +} + +static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) +{ + int is_ready; + + client_obd_list_lock(&cli->cl_loi_list_lock); + is_ready = __osc_list_maint(cli, osc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return is_ready; +} + +/* this is trying to propogate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. */ +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + +/* Performs "unstable" page accounting. This function balances the + * increment operations performed in osc_inc_unstable_pages. It is + * registered as the RPC request callback, and is executed when the + * bulk RPC is committed on the server. Thus at this point, the pages + * involved in the bulk transfer are no longer considered unstable. */ +void osc_dec_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + obd_count page_count = desc->bd_iov_count; + int i; + + /* No unstable page tracking */ + if (cli->cl_cache == NULL) + return; + + LASSERT(page_count >= 0); + + for (i = 0; i < page_count; i++) + dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + + atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); + LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); + + atomic_sub(page_count, &obd_unstable_pages); + LASSERT(atomic_read(&obd_unstable_pages) >= 0); + + spin_lock(&req->rq_lock); + req->rq_committed = 1; + req->rq_unstable = 0; + spin_unlock(&req->rq_lock); + + wake_up_all(&cli->cl_cache->ccc_unstable_waitq); +} + +/* "unstable" page accounting. See: osc_dec_unstable_pages. */ +void osc_inc_unstable_pages(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + obd_count page_count = desc->bd_iov_count; + int i; + + /* No unstable page tracking */ + if (cli->cl_cache == NULL) + return; + + LASSERT(page_count >= 0); + + for (i = 0; i < page_count; i++) + inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); + + LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); + atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); + + LASSERT(atomic_read(&obd_unstable_pages) >= 0); + atomic_add(page_count, &obd_unstable_pages); + + spin_lock(&req->rq_lock); + + /* If the request has already been committed (i.e. brw_commit + * called via rq_commit_cb), we need to undo the unstable page + * increments we just performed because rq_commit_cb wont be + * called again. Otherwise, just set the commit callback so the + * unstable page accounting is properly updated when the request + * is committed */ + if (req->rq_committed) { + /* Drop lock before calling osc_dec_unstable_pages */ + spin_unlock(&req->rq_lock); + osc_dec_unstable_pages(req); + spin_lock(&req->rq_lock); + } else { + req->rq_unstable = 1; + req->rq_commit_cb = osc_dec_unstable_pages; + } + + spin_unlock(&req->rq_lock); +} + +/* this must be called holding the loi list lock to give coverage to exit_cache, + * async_flag maintenance, and oap_request */ +static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, + struct osc_async_page *oap, int sent, int rc) +{ + struct osc_object *osc = oap->oap_obj; + struct lov_oinfo *loi = osc->oo_oinfo; + __u64 xid = 0; + + ENTRY; + if (oap->oap_request != NULL) { + if (rc == 0) + osc_inc_unstable_pages(oap->oap_request); + + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + + /* As the transfer for this page is being done, clear the flags */ + spin_lock(&oap->oap_lock); + oap->oap_async_flags = 0; + spin_unlock(&oap->oap_lock); + oap->oap_interrupted = 0; + + if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { + client_obd_list_lock(&cli->cl_loi_list_lock); + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&loi->loi_ar, xid, rc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + } + + rc = osc_completion(env, oap, oap->oap_cmd, rc); + if (rc) + CERROR("completion on oap %p obj %p returns %d.\n", + oap, osc, rc); + + EXIT; +} + +/** + * Try to add extent to one RPC. We need to think about the following things: + * - # of pages must not be over max_pages_per_rpc + * - extent must be compatible with previous ones + */ +static int try_to_add_extent_for_io(struct client_obd *cli, + struct osc_extent *ext, struct list_head *rpclist, + int *pc, unsigned int *max_pages) +{ + struct osc_extent *tmp; + ENTRY; + + EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), + ext); + + *max_pages = max(ext->oe_mppr, *max_pages); + if (*pc + ext->oe_nr_pages > *max_pages) + RETURN(0); + + list_for_each_entry(tmp, rpclist, oe_link) { + EASSERT(tmp->oe_owner == current, tmp); +#if 0 + if (overlapped(tmp, ext)) { + OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); + EASSERT(0, ext); + } +#endif + + if (tmp->oe_srvlock != ext->oe_srvlock || + !tmp->oe_grants != !ext->oe_grants) + RETURN(0); + + /* remove break for strict check */ + break; + } + + *pc += ext->oe_nr_pages; + list_move_tail(&ext->oe_link, rpclist); + ext->oe_owner = current; + RETURN(1); +} + +/** + * In order to prevent multiple ptlrpcd from breaking contiguous extents, + * get_write_extent() takes all appropriate extents in atomic. + * + * The following policy is used to collect extents for IO: + * 1. Add as many HP extents as possible; + * 2. Add the first urgent extent in urgent extent list and take it out of + * urgent list; + * 3. Add subsequent extents of this urgent extent; + * 4. If urgent list is not empty, goto 2; + * 5. Traverse the extent tree from the 1st extent; + * 6. Above steps exit if there is no space in this RPC. + */ +static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + + LASSERT(osc_object_is_locked(obj)); + while (!list_empty(&obj->oo_hp_exts)) { + ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, + oe_link); + LASSERT(ext->oe_state == OES_CACHE); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + if (page_count == max_pages) + return page_count; + + while (!list_empty(&obj->oo_urgent_exts)) { + ext = list_entry(obj->oo_urgent_exts.next, + struct osc_extent, oe_link); + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + if (!ext->oe_intree) + continue; + + while ((ext = next_extent(ext)) != NULL) { + if ((ext->oe_state != OES_CACHE) || + (!list_empty(&ext->oe_link) && + ext->oe_owner != NULL)) + continue; + + if (!try_to_add_extent_for_io(cli, ext, rpclist, + &page_count, &max_pages)) + return page_count; + } + } + if (page_count == max_pages) + return page_count; + + ext = first_extent(obj); + while (ext != NULL) { + if ((ext->oe_state != OES_CACHE) || + /* this extent may be already in current rpclist */ + (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { + ext = next_extent(ext); + continue; + } + + if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, + &max_pages)) + return page_count; + + ext = next_extent(ext); + } + return page_count; +} + +static int +osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + LIST_HEAD(rpclist); + struct osc_extent *ext; + struct osc_extent *tmp; + struct osc_extent *first = NULL; + obd_count page_count = 0; + int srvlock = 0; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + + page_count = get_write_extents(osc, &rpclist); + LASSERT(equi(page_count == 0, list_empty(&rpclist))); + + if (list_empty(&rpclist)) + RETURN(0); + + osc_update_pending(osc, OBD_BRW_WRITE, -page_count); + + list_for_each_entry(ext, &rpclist, oe_link) { + LASSERT(ext->oe_state == OES_CACHE || + ext->oe_state == OES_LOCK_DONE); + if (ext->oe_state == OES_CACHE) + osc_extent_state_set(ext, OES_LOCKING); + else + osc_extent_state_set(ext, OES_RPC); + } + + /* we're going to grab page lock, so release object lock because + * lock order is page lock -> object lock. */ + osc_object_unlock(osc); + + list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { + if (ext->oe_state == OES_LOCKING) { + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + continue; + } + } + if (first == NULL) { + first = ext; + srvlock = ext->oe_srvlock; + } else { + LASSERT(srvlock == ext->oe_srvlock); + } + } + + if (!list_empty(&rpclist)) { + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol); + LASSERT(list_empty(&rpclist)); + } + + osc_object_lock(osc); + RETURN(rc); +} + +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cmd OBD_BRW_* macroses + * \param lop pending pages + * + * \return zero if no page added to send queue. + * \return 1 if pages successfully added to send queue. + * \return negative on errors. + */ +static int +osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + struct osc_extent *ext; + struct osc_extent *next; + LIST_HEAD(rpclist); + int page_count = 0; + unsigned int max_pages = cli->cl_max_pages_per_rpc; + int rc = 0; + ENTRY; + + LASSERT(osc_object_is_locked(osc)); + list_for_each_entry_safe(ext, next, + &osc->oo_reading_exts, oe_link) { + EASSERT(ext->oe_state == OES_LOCK_DONE, ext); + if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, + &max_pages)) + break; + osc_extent_state_set(ext, OES_RPC); + EASSERT(ext->oe_nr_pages <= max_pages, ext); + } + LASSERT(page_count <= max_pages); + + osc_update_pending(osc, OBD_BRW_READ, -page_count); + + if (!list_empty(&rpclist)) { + osc_object_unlock(osc); + + LASSERT(page_count > 0); + rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol); + LASSERT(list_empty(&rpclist)); + + osc_object_lock(osc); + } + RETURN(rc); +} + +#define list_to_obj(list, item) ({ \ + struct list_head *__tmp = (list)->next; \ + list_del_init(__tmp); \ + list_entry(__tmp, struct osc_object, oo_##item); \ +}) + +/* This is called by osc_check_rpcs() to find which objects have pages that + * we could be sending. These lists are maintained by osc_makes_rpc(). */ +static struct osc_object *osc_next_obj(struct client_obd *cli) +{ + ENTRY; + + /* First return objects that have blocked locks so that they + * will be flushed quickly and other clients can get the lock, + * then objects which have pages ready to be stuffed into RPCs */ + if (!list_empty(&cli->cl_loi_hp_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); + if (!list_empty(&cli->cl_loi_ready_list)) + RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); + + /* then if we have cache waiters, return all objects with queued + * writes. This is especially important when many small files + * have filled up the cache and not been fired into rpcs because + * they don't pass the nr_pending/object threshhold */ + if (!list_empty(&cli->cl_cache_waiters) && + !list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); + + /* then return all queued objects when we have an invalid import + * so that they get flushed */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { + if (!list_empty(&cli->cl_loi_write_list)) + RETURN(list_to_obj(&cli->cl_loi_write_list, + write_item)); + if (!list_empty(&cli->cl_loi_read_list)) + RETURN(list_to_obj(&cli->cl_loi_read_list, + read_item)); + } + RETURN(NULL); +} + +/* called with the loi list lock held */ +static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli, + pdl_policy_t pol) +{ + struct osc_object *osc; + int rc = 0; + ENTRY; + + while ((osc = osc_next_obj(cli)) != NULL) { + struct cl_object *obj = osc2cl(osc); + struct lu_ref_link *link; + + OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); + + if (osc_max_rpc_in_flight(cli, osc)) { + __osc_list_maint(cli, osc); + break; + } + + cl_object_get(obj); + client_obd_list_unlock(&cli->cl_loi_list_lock); + link = lu_object_ref_add(&obj->co_lu, "check", current); + + /* attempt some read/write balancing by alternating between + * reads and writes in an object. The makes_rpc checks here + * would be redundant if we were getting read/write work items + * instead of objects. we don't want send_oap_rpc to drain a + * partial read pending queue when we're given this object to + * do io on writes while there are cache waiters */ + osc_object_lock(osc); + if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { + rc = osc_send_write_rpc(env, cli, osc, pol); + if (rc < 0) { + CERROR("Write request failed with %d\n", rc); + + /* osc_send_write_rpc failed, mostly because of + * memory pressure. + * + * It can't break here, because if: + * - a page was submitted by osc_io_submit, so + * page locked; + * - no request in flight + * - no subsequent request + * The system will be in live-lock state, + * because there is no chance to call + * osc_io_unplug() and osc_check_rpcs() any + * more. pdflush can't help in this case, + * because it might be blocked at grabbing + * the page lock as we mentioned. + * + * Anyway, continue to drain pages. */ + /* break; */ + } + } + if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { + rc = osc_send_read_rpc(env, cli, osc, pol); + if (rc < 0) + CERROR("Read request failed with %d\n", rc); + } + osc_object_unlock(osc); + + osc_list_maint(cli, osc); + lu_object_ref_del_at(&obj->co_lu, link, "check", current); + cl_object_put(env, obj); + + client_obd_list_lock(&cli->cl_loi_list_lock); + } +} + +static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol, int async) +{ + int has_rpcs = 1; + int rc = 0; + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (osc != NULL) + has_rpcs = __osc_list_maint(cli, osc); + if (has_rpcs) { + if (!async) { + /* disable osc_lru_shrink() temporarily to avoid + * potential stack overrun problem. LU-2859 */ + atomic_inc(&cli->cl_lru_shrinkers); + osc_check_rpcs(env, cli, pol); + atomic_dec(&cli->cl_lru_shrinkers); + } else { + CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", + cli); + LASSERT(cli->cl_writeback_work != NULL); + rc = ptlrpcd_queue_work(cli->cl_writeback_work); + } + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} + +static int osc_io_unplug_async(const struct lu_env *env, + struct client_obd *cli, struct osc_object *osc) +{ + /* XXX: policy is no use actually. */ + return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1); +} + +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol) +{ + (void)osc_io_unplug0(env, cli, osc, pol, 0); +} + +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset) +{ + struct obd_export *exp = osc_export(osc); + struct osc_async_page *oap = &ops->ops_oap; + ENTRY; + + if (!page) + return cfs_size_round(sizeof(*oap)); + + oap->oap_magic = OAP_MAGIC; + oap->oap_cli = &exp->exp_obd->u.cli; + oap->oap_obj = osc; + + oap->oap_page = page; + oap->oap_obj_off = offset; + LASSERT(!(offset & ~CFS_PAGE_MASK)); + + if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + + INIT_LIST_HEAD(&oap->oap_pending_item); + INIT_LIST_HEAD(&oap->oap_rpc_item); + + spin_lock_init(&oap->oap_lock); + CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n", + oap, page, oap->oap_obj_off); + RETURN(0); +} + +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_extent *ext = NULL; + struct osc_async_page *oap = &ops->ops_oap; + struct client_obd *cli = oap->oap_cli; + struct osc_object *osc = oap->oap_obj; + pgoff_t index; + int grants = 0; + int brw_flags = OBD_BRW_ASYNC; + int cmd = OBD_BRW_WRITE; + int need_release = 0; + int rc = 0; + ENTRY; + + if (oap->oap_magic != OAP_MAGIC) + RETURN(-EINVAL); + + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(-EIO); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) + RETURN(-EBUSY); + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; + if (!client_is_remote(osc_export(osc)) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) { + brw_flags |= OBD_BRW_NOQUOTA; + cmd |= OBD_BRW_NOQUOTA; + } + + /* check if the file's owner/group is over quota */ + if (!(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr *attr; + unsigned int qid[MAXQUOTAS]; + + obj = cl_object_top(&osc->oo_cl); + attr = &osc_env_info(env)->oti_attr; + + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + + qid[USRQUOTA] = attr->cat_uid; + qid[GRPQUOTA] = attr->cat_gid; + if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) + rc = -EDQUOT; + if (rc) + RETURN(rc); + } + + oap->oap_cmd = cmd; + oap->oap_page_off = ops->ops_from; + oap->oap_count = ops->ops_to - ops->ops_from; + oap->oap_async_flags = 0; + oap->oap_brw_flags = brw_flags; + + OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", + oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); + + index = oap2cl_page(oap)->cp_index; + + /* Add this page into extent by the following steps: + * 1. if there exists an active extent for this IO, mostly this page + * can be added to the active extent and sometimes we need to + * expand extent to accomodate this page; + * 2. otherwise, a new extent will be allocated. */ + + ext = oio->oi_active; + if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { + /* one chunk plus extent overhead must be enough to write this + * page */ + grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + if (ext->oe_end >= index) + grants = 0; + + /* it doesn't need any grant to dirty this page */ + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = osc_enter_cache_try(cli, oap, grants, 0); + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (rc == 0) { /* try failed */ + grants = 0; + need_release = 1; + } else if (ext->oe_end < index) { + int tmp = grants; + /* try to expand this extent */ + rc = osc_extent_expand(ext, index, &tmp); + if (rc < 0) { + need_release = 1; + /* don't free reserved grant */ + } else { + OSC_EXTENT_DUMP(D_CACHE, ext, + "expanded for %lu.\n", index); + osc_unreserve_grant(cli, grants, tmp); + grants = 0; + } + } + rc = 0; + } else if (ext != NULL) { + /* index is located outside of active extent */ + need_release = 1; + } + if (need_release) { + osc_extent_release(env, ext); + oio->oi_active = NULL; + ext = NULL; + } + + if (ext == NULL) { + int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + + /* try to find new extent to cover this page */ + LASSERT(oio->oi_active == NULL); + /* we may have allocated grant for this page if we failed + * to expand the previous active extent. */ + LASSERT(ergo(grants > 0, grants >= tmp)); + + rc = 0; + if (grants == 0) { + /* we haven't allocated grant for this page. */ + rc = osc_enter_cache(env, cli, oap, tmp); + if (rc == 0) + grants = tmp; + } + + tmp = grants; + if (rc == 0) { + ext = osc_extent_find(env, osc, index, &tmp); + if (IS_ERR(ext)) { + LASSERT(tmp == grants); + osc_exit_cache(cli, oap); + rc = PTR_ERR(ext); + ext = NULL; + } else { + oio->oi_active = ext; + } + } + if (grants > 0) + osc_unreserve_grant(cli, grants, tmp); + } + + LASSERT(ergo(rc == 0, ext != NULL)); + if (ext != NULL) { + EASSERTF(ext->oe_end >= index && ext->oe_start <= index, + ext, "index = %lu.\n", index); + LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); + + osc_object_lock(osc); + if (ext->oe_nr_pages == 0) + ext->oe_srvlock = ops->ops_srvlock; + else + LASSERT(ext->oe_srvlock == ops->ops_srvlock); + ++ext->oe_nr_pages; + list_add_tail(&oap->oap_pending_item, &ext->oe_pages); + osc_object_unlock(osc); + } + RETURN(rc); +} + +int osc_teardown_async_page(const struct lu_env *env, + struct osc_object *obj, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_extent *ext = NULL; + int rc = 0; + ENTRY; + + LASSERT(oap->oap_magic == OAP_MAGIC); + + CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", + oap, ops, oap2cl_page(oap)->cp_index); + + osc_object_lock(obj); + if (!list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); + rc = -EBUSY; + } else if (!list_empty(&oap->oap_pending_item)) { + ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); + /* only truncated pages are allowed to be taken out. + * See osc_extent_truncate() and osc_cache_truncate_start() + * for details. */ + if (ext != NULL && ext->oe_state != OES_TRUNC) { + OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", + oap2cl_page(oap)->cp_index); + rc = -EBUSY; + } + } + osc_object_unlock(obj); + if (ext != NULL) + osc_extent_put(env, ext); + RETURN(rc); +} + +/** + * This is called when a page is picked up by kernel to write out. + * + * We should find out the corresponding extent and add the whole extent + * into urgent list. The extent may be being truncated or used, handle it + * carefully. + */ +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops) +{ + struct osc_extent *ext = NULL; + struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); + struct cl_page *cp = ops->ops_cl.cpl_page; + pgoff_t index = cp->cp_index; + struct osc_async_page *oap = &ops->ops_oap; + bool unplug = false; + int rc = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_lookup(obj, index); + if (ext == NULL) { + osc_extent_tree_dump(D_ERROR, obj); + LASSERTF(0, "page index %lu is NOT covered.\n", index); + } + + switch (ext->oe_state) { + case OES_RPC: + case OES_LOCK_DONE: + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), + "flush an in-rpc page?\n"); + LASSERT(0); + break; + case OES_LOCKING: + /* If we know this extent is being written out, we should abort + * so that the writer can make this page ready. Otherwise, there + * exists a deadlock problem because other process can wait for + * page writeback bit holding page lock; and meanwhile in + * vvp_page_make_ready(), we need to grab page lock before + * really sending the RPC. */ + case OES_TRUNC: + /* race with truncate, page will be redirtied */ + GOTO(out, rc = -EAGAIN); + default: + break; + } + + rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); + if (rc) + GOTO(out, rc); + + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; + spin_unlock(&oap->oap_lock); + + if (memory_pressure_get()) + ext->oe_memalloc = 1; + + ext->oe_urgent = 1; + if (ext->oe_state == OES_CACHE) { + OSC_EXTENT_DUMP(D_CACHE, ext, + "flush page %p make it urgent.\n", oap); + if (list_empty(&ext->oe_link)) + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + rc = 0; + EXIT; + +out: + osc_object_unlock(obj); + osc_extent_put(env, ext); + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + return rc; +} + +/** + * this is called when a sync waiter receives an interruption. Its job is to + * get the caller woken as soon as possible. If its page hasn't been put in an + * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as + * desiring interruption which will forcefully complete the rpc once the rpc + * has timed out. + */ +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) +{ + struct osc_async_page *oap = &ops->ops_oap; + struct osc_object *obj = oap->oap_obj; + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *found = NULL; + struct list_head *plist; + pgoff_t index = oap2cl_page(oap)->cp_index; + int rc = -EBUSY; + int cmd; + ENTRY; + + LASSERT(!oap->oap_interrupted); + oap->oap_interrupted = 1; + + /* Find out the caching extent */ + osc_object_lock(obj); + if (oap->oap_cmd & OBD_BRW_WRITE) { + plist = &obj->oo_urgent_exts; + cmd = OBD_BRW_WRITE; + } else { + plist = &obj->oo_reading_exts; + cmd = OBD_BRW_READ; + } + list_for_each_entry(ext, plist, oe_link) { + if (ext->oe_start <= index && ext->oe_end >= index) { + LASSERT(ext->oe_state == OES_LOCK_DONE); + /* For OES_LOCK_DONE state extent, it has already held + * a refcount for RPC. */ + found = osc_extent_get(ext); + break; + } + } + if (found != NULL) { + list_del_init(&found->oe_link); + osc_update_pending(obj, cmd, -found->oe_nr_pages); + osc_object_unlock(obj); + + osc_extent_finish(env, found, 0, -EINTR); + osc_extent_put(env, found); + rc = 0; + } else { + osc_object_unlock(obj); + /* ok, it's been put in an rpc. only one oap gets a request + * reference */ + if (oap->oap_request != NULL) { + ptlrpc_mark_interrupted(oap->oap_request); + ptlrpcd_wake(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + } + + osc_list_maint(cli, obj); + RETURN(rc); +} + +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_async_page *oap; + int page_count = 0; + int mppr = cli->cl_max_pages_per_rpc; + pgoff_t start = CL_PAGE_EOF; + pgoff_t end = 0; + ENTRY; + + list_for_each_entry(oap, list, oap_pending_item) { + struct cl_page *cp = oap2cl_page(oap); + if (cp->cp_index > end) + end = cp->cp_index; + if (cp->cp_index < start) + start = cp->cp_index; + ++page_count; + mppr <<= (page_count > mppr); + } + + ext = osc_extent_alloc(obj); + if (ext == NULL) { + list_for_each_entry(oap, list, oap_pending_item) { + list_del_init(&oap->oap_pending_item); + osc_ap_completion(env, cli, oap, 0, -ENOMEM); + } + RETURN(-ENOMEM); + } + + ext->oe_rw = !!(cmd & OBD_BRW_READ); + ext->oe_urgent = 1; + ext->oe_start = start; + ext->oe_end = ext->oe_max_end = end; + ext->oe_obj = obj; + ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); + ext->oe_nr_pages = page_count; + ext->oe_mppr = mppr; + list_splice_init(list, &ext->oe_pages); + + osc_object_lock(obj); + /* Reuse the initial refcount for RPC, don't drop it */ + osc_extent_state_set(ext, OES_LOCK_DONE); + if (cmd & OBD_BRW_WRITE) { + list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); + osc_update_pending(obj, OBD_BRW_WRITE, page_count); + } else { + list_add_tail(&ext->oe_link, &obj->oo_reading_exts); + osc_update_pending(obj, OBD_BRW_READ, page_count); + } + osc_object_unlock(obj); + + osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND); + RETURN(0); +} + +/** + * Called by osc_io_setattr_start() to freeze and destroy covering extents. + */ +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size) +{ + struct client_obd *cli = osc_cli(obj); + struct osc_extent *ext; + struct osc_extent *waiting = NULL; + pgoff_t index; + LIST_HEAD(list); + int result = 0; + bool partial; + ENTRY; + + /* pages with index greater or equal to index will be truncated. */ + index = cl_index(osc2cl(obj), size); + partial = size > cl_offset(osc2cl(obj), index); + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + EASSERT(ext->oe_state != OES_TRUNC, ext); + + if (ext->oe_state > OES_CACHE || ext->oe_urgent) { + /* if ext is in urgent state, it means there must exist + * a page already having been flushed by write_page(). + * We have to wait for this extent because we can't + * truncate that page. */ + LASSERT(!ext->oe_hp); + OSC_EXTENT_DUMP(D_CACHE, ext, + "waiting for busy extent\n"); + waiting = osc_extent_get(ext); + break; + } + + OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size); + + osc_extent_get(ext); + if (ext->oe_state == OES_ACTIVE) { + /* though we grab inode mutex for write path, but we + * release it before releasing extent(in osc_io_end()), + * so there is a race window that an extent is still + * in OES_ACTIVE when truncate starts. */ + LASSERT(!ext->oe_trunc_pending); + ext->oe_trunc_pending = 1; + } else { + EASSERT(ext->oe_state == OES_CACHE, ext); + osc_extent_state_set(ext, OES_TRUNC); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + EASSERT(list_empty(&ext->oe_link), ext); + list_add_tail(&ext->oe_link, &list); + + ext = next_extent(ext); + } + osc_object_unlock(obj); + + osc_list_maint(cli, obj); + + while (!list_empty(&list)) { + int rc; + + ext = list_entry(list.next, struct osc_extent, oe_link); + list_del_init(&ext->oe_link); + + /* extent may be in OES_ACTIVE state because inode mutex + * is released before osc_io_end() in file write case */ + if (ext->oe_state != OES_TRUNC) + osc_extent_wait(env, ext, OES_TRUNC); + + rc = osc_extent_truncate(ext, index, partial); + if (rc < 0) { + if (result == 0) + result = rc; + + OSC_EXTENT_DUMP(D_ERROR, ext, + "truncate error %d\n", rc); + } else if (ext->oe_nr_pages == 0) { + osc_extent_remove(ext); + } else { + /* this must be an overlapped extent which means only + * part of pages in this extent have been truncated. + */ + EASSERTF(ext->oe_start <= index, ext, + "trunc index = %lu/%d.\n", index, partial); + /* fix index to skip this partially truncated extent */ + index = ext->oe_end + 1; + partial = false; + + /* we need to hold this extent in OES_TRUNC state so + * that no writeback will happen. This is to avoid + * BUG 17397. */ + LASSERT(oio->oi_trunc == NULL); + oio->oi_trunc = osc_extent_get(ext); + OSC_EXTENT_DUMP(D_CACHE, ext, + "trunc at "LPU64"\n", size); + } + osc_extent_put(env, ext); + } + if (waiting != NULL) { + int rc; + + /* ignore the result of osc_extent_wait the write initiator + * should take care of it. */ + rc = osc_extent_wait(env, waiting, OES_INV); + if (rc < 0) + OSC_EXTENT_DUMP(D_CACHE, ext, "wait error: %d.\n", rc); + + osc_extent_put(env, waiting); + waiting = NULL; + goto again; + } + RETURN(result); +} + +/** + * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. + */ +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj) +{ + struct osc_extent *ext = oio->oi_trunc; + + oio->oi_trunc = NULL; + if (ext != NULL) { + bool unplug = false; + + EASSERT(ext->oe_nr_pages > 0, ext); + EASSERT(ext->oe_state == OES_TRUNC, ext); + EASSERT(!ext->oe_urgent, ext); + + OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); + osc_object_lock(obj); + osc_extent_state_set(ext, OES_CACHE); + if (ext->oe_fsync_wait && !ext->oe_urgent) { + ext->oe_urgent = 1; + list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); + unplug = true; + } + osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); + osc_object_unlock(obj); + osc_extent_put(env, ext); + + if (unplug) + osc_io_unplug_async(env, osc_cli(obj), obj); + } +} + +/** + * Wait for extents in a specific range to be written out. + * The caller must have called osc_cache_writeback_range() to issue IO + * otherwise it will take a long time for this function to finish. + * + * Caller must hold inode_mutex , or cancel exclusive dlm lock so that + * nobody else can dirty this range of file while we're waiting for + * extents to be written. + */ +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end) +{ + struct osc_extent *ext; + pgoff_t index = start; + int result = 0; + ENTRY; + +again: + osc_object_lock(obj); + ext = osc_extent_search(obj, index); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < index) + ext = next_extent(ext); + while (ext != NULL) { + int rc; + + if (ext->oe_start > end) + break; + + if (!ext->oe_fsync_wait) { + ext = next_extent(ext); + continue; + } + + EASSERT(ergo(ext->oe_state == OES_CACHE, + ext->oe_hp || ext->oe_urgent), ext); + EASSERT(ergo(ext->oe_state == OES_ACTIVE, + !ext->oe_hp && ext->oe_urgent), ext); + + index = ext->oe_end + 1; + osc_extent_get(ext); + osc_object_unlock(obj); + + rc = osc_extent_wait(env, ext, OES_INV); + if (result == 0) + result = rc; + osc_extent_put(env, ext); + goto again; + } + osc_object_unlock(obj); + + OSC_IO_DEBUG(obj, "sync file range.\n"); + RETURN(result); +} + +/** + * Called to write out a range of osc object. + * + * @hp : should be set this is caused by lock cancel; + * @discard: is set if dirty pages should be dropped - file will be deleted or + * truncated, this implies there is no partially discarding extents. + * + * Return how many pages will be issued, or error code if error occurred. + */ +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard) +{ + struct osc_extent *ext; + LIST_HEAD(discard_list); + bool unplug = false; + int result = 0; + ENTRY; + + osc_object_lock(obj); + ext = osc_extent_search(obj, start); + if (ext == NULL) + ext = first_extent(obj); + else if (ext->oe_end < start) + ext = next_extent(ext); + while (ext != NULL) { + if (ext->oe_start > end) + break; + + ext->oe_fsync_wait = 1; + switch (ext->oe_state) { + case OES_CACHE: + result += ext->oe_nr_pages; + if (!discard) { + struct list_head *list = NULL; + if (hp) { + EASSERT(!ext->oe_hp, ext); + ext->oe_hp = 1; + list = &obj->oo_hp_exts; + } else if (!ext->oe_urgent) { + ext->oe_urgent = 1; + list = &obj->oo_urgent_exts; + } + if (list != NULL) + list_move_tail(&ext->oe_link, list); + unplug = true; + } else { + /* the only discarder is lock cancelling, so + * [start, end] must contain this extent */ + EASSERT(ext->oe_start >= start && + ext->oe_max_end <= end, ext); + osc_extent_state_set(ext, OES_LOCKING); + ext->oe_owner = current; + list_move_tail(&ext->oe_link, + &discard_list); + osc_update_pending(obj, OBD_BRW_WRITE, + -ext->oe_nr_pages); + } + break; + case OES_ACTIVE: + /* It's pretty bad to wait for ACTIVE extents, because + * we don't know how long we will wait for it to be + * flushed since it may be blocked at awaiting more + * grants. We do this for the correctness of fsync. */ + LASSERT(hp == 0 && discard == 0); + ext->oe_urgent = 1; + break; + case OES_TRUNC: + /* this extent is being truncated, can't do anything + * for it now. it will be set to urgent after truncate + * is finished in osc_cache_truncate_end(). */ + default: + break; + } + ext = next_extent(ext); + } + osc_object_unlock(obj); + + LASSERT(ergo(!discard, list_empty(&discard_list))); + if (!list_empty(&discard_list)) { + struct osc_extent *tmp; + int rc; + + osc_list_maint(osc_cli(obj), obj); + list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { + list_del_init(&ext->oe_link); + EASSERT(ext->oe_state == OES_LOCKING, ext); + + /* Discard caching pages. We don't actually write this + * extent out but we complete it as if we did. */ + rc = osc_extent_make_ready(env, ext); + if (unlikely(rc < 0)) { + OSC_EXTENT_DUMP(D_ERROR, ext, + "make_ready returned %d\n", rc); + if (result >= 0) + result = rc; + } + + /* finish the extent as if the pages were sent */ + osc_extent_finish(env, ext, 0, 0); + } + } + + if (unplug) + osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND); + + if (hp || discard) { + int rc; + rc = osc_cache_wait_range(env, obj, start, end); + if (result >= 0 && rc < 0) + result = rc; + } + + OSC_IO_DEBUG(obj, "cache page out.\n"); + RETURN(result); +} + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h new file mode 100644 index 000000000000..001a9c84ab8e --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h @@ -0,0 +1,679 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H + +# include + +#include +/* osc_build_res_name() */ +#include +#include +#include +#include "osc_internal.h" + +/** \defgroup osc osc + * @{ + */ + +struct osc_extent; + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + int oi_lockless; + /** active extents, we know how many bytes is going to be written, + * so having an active extent will prevent it from being fragmented */ + struct osc_extent *oi_active; + /** partially truncated extent, we need to hold this extent to prevent + * page writeback from happening. */ + struct osc_extent *oi_trunc; + + struct obd_info oi_info; + struct obdo oi_oa; + struct osc_async_cbargs { + bool opc_rpc_sent; + int opc_rc; + struct completion opc_sync; + } oi_cbarg; +}; + +/** + * State of transfer for osc. + */ +struct osc_req { + struct cl_req_slice or_cl; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +#define OTI_PVEC_SIZE 64 +struct osc_thread_info { + struct ldlm_res_id oti_resname; + ldlm_policy_data_t oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_page_list oti_plist; + struct cl_io oti_io; + struct cl_page *oti_pvec[OTI_PVEC_SIZE]; +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + cfs_time_t oo_contention_time; + /** + * List of pages in transfer. + */ + struct list_head oo_inflight[CRT_NR]; + /** + * Lock, protecting ccc_object::cob_inflight, because a seat-belt is + * locked during take-off and landing. + */ + spinlock_t oo_seatbelt; + + /** + * used by the osc to keep track of what objects to build into rpcs. + * Protected by client_obd->cli_loi_list_lock. + */ + struct list_head oo_ready_item; + struct list_head oo_hp_ready_item; + struct list_head oo_write_item; + struct list_head oo_read_item; + + /** + * extent is a red black tree to manage (async) dirty pages. + */ + struct rb_root oo_root; + /** + * Manage write(dirty) extents. + */ + struct list_head oo_hp_exts; /* list of hp extents */ + struct list_head oo_urgent_exts; /* list of writeback extents */ + struct list_head oo_rpc_exts; + + struct list_head oo_reading_exts; + + atomic_t oo_nr_reads; + atomic_t oo_nr_writes; + + /** Protect extent tree. Will be used to protect + * oo_{read|write}_pages soon. */ + spinlock_t oo_lock; +}; + +static inline void osc_object_lock(struct osc_object *obj) +{ + spin_lock(&obj->oo_lock); +} + +static inline int osc_object_trylock(struct osc_object *obj) +{ + return spin_trylock(&obj->oo_lock); +} + +static inline void osc_object_unlock(struct osc_object *obj) +{ + spin_unlock(&obj->oo_lock); +} + +static inline int osc_object_is_locked(struct osc_object *obj) +{ + return spin_is_locked(&obj->oo_lock); +} + +/* + * Lock "micro-states" for osc layer. + */ +enum osc_lock_state { + OLS_NEW, + OLS_ENQUEUED, + OLS_UPCALL_RECEIVED, + OLS_GRANTED, + OLS_RELEASED, + OLS_BLOCKED, + OLS_CANCELLED +}; + +/** + * osc-private state of cl_lock. + * + * Interaction with DLM. + * + * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode). + * + * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in + * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock. + * + * This pointer is protected through a reference, acquired by + * osc_lock_upcall0(). Also, an additional reference is acquired by + * ldlm_lock_addref() call protecting the lock from cancellation, until + * osc_lock_unuse() releases it. + * + * Below is a description of how lock references are acquired and released + * inside of DLM. + * + * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) + * - ldlm_lock_create() + * - ldlm_lock_new(): initializes a lock with 2 references. One for + * the caller (released when reply from the server is received, or on + * error), and another for the hash table. + * - ldlm_lock_addref_internal(): protects the lock from cancellation. + * + * - When reply is received from the server (osc_enqueue_interpret()) + * - ldlm_cli_enqueue_fini() + * - LDLM_LOCK_PUT(): releases caller reference acquired by + * ldlm_lock_new(). + * - if (rc != 0) + * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). + * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). + * + * - When lock is being cancelled (ldlm_lock_cancel()) + * - ldlm_lock_destroy() + * - LDLM_LOCK_PUT(): releases hash-table reference acquired by + * ldlm_lock_new(). + * + * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called + * either when lock is cancelled (osc_lock_blocking()), or when locks is + * deleted without cancellation (e.g., from cl_locks_prune()). In the latter + * case ldlm lock remains in memory, and can be re-attached to osc_lock in the + * future. + */ +struct osc_lock { + struct cl_lock_slice ols_cl; + /** underlying DLM lock */ + struct ldlm_lock *ols_lock; + /** lock value block */ + struct ost_lvb ols_lvb; + /** DLM flags with which osc_lock::ols_lock was enqueued */ + __u64 ols_flags; + /** osc_lock::ols_lock handle */ + struct lustre_handle ols_handle; + struct ldlm_enqueue_info ols_einfo; + enum osc_lock_state ols_state; + + /** + * How many pages are using this lock for io, currently only used by + * read-ahead. If non-zero, the underlying dlm lock won't be cancelled + * during recovery to avoid deadlock. see bz16774. + * + * \see osc_page::ops_lock + * \see osc_page_addref_lock(), osc_page_putref_lock() + */ + atomic_t ols_pageref; + + /** + * true, if ldlm_lock_addref() was called against + * osc_lock::ols_lock. This is used for sanity checking. + * + * \see osc_lock::ols_has_ref + */ + unsigned ols_hold :1, + /** + * this is much like osc_lock::ols_hold, except that this bit is + * cleared _after_ reference in released in osc_lock_unuse(). This + * fine distinction is needed because: + * + * - if ldlm lock still has a reference, osc_ast_data_get() needs + * to return associated cl_lock (so that a flag is needed that is + * cleared after ldlm_lock_decref() returned), and + * + * - ldlm_lock_decref() can invoke blocking ast (for a + * LDLM_FL_CBPENDING lock), and osc_lock functions like + * osc_lock_cancel() called from there need to know whether to + * release lock reference (so that a flag is needed that is + * cleared before ldlm_lock_decref() is called). + */ + ols_has_ref:1, + /** + * inherit the lockless attribute from top level cl_io. + * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. + */ + ols_locklessable:1, + /** + * set by osc_lock_use() to wait until blocking AST enters into + * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for + * further synchronization. + */ + ols_ast_wait:1, + /** + * If the data of this lock has been flushed to server side. + */ + ols_flush:1, + /** + * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat + * the EVAVAIL error as torerable, this will make upper logic happy + * to wait all glimpse locks to each OSTs to be completed. + * Glimpse lock converts to normal lock if the server lock is + * granted. + * Glimpse lock should be destroyed immediately after use. + */ + ols_glimpse:1, + /** + * For async glimpse lock. + */ + ols_agl:1; + /** + * IO that owns this lock. This field is used for a dead-lock + * avoidance by osc_lock_enqueue_wait(). + * + * XXX: unfortunately, the owner of a osc_lock is not unique, + * the lock may have multiple users, if the lock is granted and + * then matched. + */ + struct osc_io *ols_owner; +}; + + +/** + * Page state private for osc layer. + */ +struct osc_page { + struct cl_page_slice ops_cl; + /** + * Page queues used by osc to detect when RPC can be formed. + */ + struct osc_async_page ops_oap; + /** + * An offset within page from which next transfer starts. This is used + * by cl_page_clip() to submit partial page transfers. + */ + int ops_from; + /** + * An offset within page at which next transfer ends. + * + * \see osc_page::ops_from. + */ + int ops_to; + /** + * Boolean, true iff page is under transfer. Used for sanity checking. + */ + unsigned ops_transfer_pinned:1, + /** + * True for a `temporary page' created by read-ahead code, probably + * outside of any DLM lock. + */ + ops_temp:1, + /** + * in LRU? + */ + ops_in_lru:1, + /** + * Set if the page must be transferred with OBD_BRW_SRVLOCK. + */ + ops_srvlock:1; + union { + /** + * lru page list. ops_inflight and ops_lru are exclusive so + * that they can share the same data. + */ + struct list_head ops_lru; + /** + * Linkage into a per-osc_object list of pages in flight. For + * debugging. + */ + struct list_head ops_inflight; + }; + /** + * Thread that submitted this page for transfer. For debugging. + */ + task_t *ops_submitter; + /** + * Submit time - the time when the page is starting RPC. For debugging. + */ + cfs_time_t ops_submit_time; + + /** + * A lock of which we hold a reference covers this page. Only used by + * read-ahead: for a readahead page, we hold it's covering lock to + * prevent it from being canceled during recovery. + * + * \see osc_lock::ols_pageref + * \see osc_page_addref_lock(), osc_page_putref_lock(). + */ + struct cl_lock *ops_lock; +}; + +extern struct kmem_cache *osc_lock_kmem; +extern struct kmem_cache *osc_object_kmem; +extern struct kmem_cache *osc_thread_kmem; +extern struct kmem_cache *osc_session_kmem; +extern struct kmem_cache *osc_req_kmem; +extern struct kmem_cache *osc_extent_kmem; + +extern struct lu_device_type osc_device_type; +extern struct lu_context_key osc_key; +extern struct lu_context_key osc_session_key; + +#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY) + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +int osc_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int osc_req_init (const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + +void osc_index2policy (ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end); +int osc_lvb_print (const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb); + +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags); +int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); +int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, + obd_flag async_flags); +int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, + struct page *page, loff_t offset); +int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, + struct osc_page *ops); +int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, + struct osc_page *ops); +int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, + struct list_head *list, int cmd, int brw_flags); +int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj, __u64 size); +void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, + struct osc_object *obj); +int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end, int hp, int discard); +int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, + pgoff_t start, pgoff_t end); +void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, + struct osc_object *osc, pdl_policy_t pol); + +void osc_object_set_contended (struct osc_object *obj); +void osc_object_clear_contended(struct osc_object *obj); +int osc_object_is_contended (struct osc_object *obj); + +int osc_lock_is_lockless (const struct osc_lock *olck); + +/***************************************************************************** + * + * Accessors. + * + */ + +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct client_obd *osc_cli(const struct osc_object *obj) +{ + return &osc_export(obj)->exp_obd->u.cli; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline struct cl_object *osc2cl(const struct osc_object *obj) +{ + return (struct cl_object *)&obj->oo_cl; +} + +static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); + if (mode == CLM_READ) + return LCK_PR; + else if (mode == CLM_WRITE) + return LCK_PW; + else + return LCK_GROUP; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); + if (mode == LCK_PR) + return CLM_READ; + else if (mode == LCK_PW) + return CLM_WRITE; + else + return CLM_GROUP; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_page *oap2osc(struct osc_async_page *oap) +{ + return container_of0(oap, struct osc_page, ops_oap); +} + +static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) +{ + return oap2osc(oap)->ops_cl.cpl_page; +} + +static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) +{ + return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +static inline int osc_io_srvlock(struct osc_io *oio) +{ + return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); +} + +enum osc_extent_state { + OES_INV = 0, /** extent is just initialized or destroyed */ + OES_ACTIVE = 1, /** process is using this extent */ + OES_CACHE = 2, /** extent is ready for IO */ + OES_LOCKING = 3, /** locking page to prepare IO */ + OES_LOCK_DONE = 4, /** locking finished, ready to send */ + OES_RPC = 5, /** in RPC */ + OES_TRUNC = 6, /** being truncated */ + OES_STATE_MAX +}; +#define OES_STRINGS { "inv", "active", "cache", "locking", "lockdone", "rpc", \ + "trunc", NULL } + +/** + * osc_extent data to manage dirty pages. + * osc_extent has the following attributes: + * 1. all pages in the same must be in one RPC in write back; + * 2. # of pages must be less than max_pages_per_rpc - implied by 1; + * 3. must be covered by only 1 osc_lock; + * 4. exclusive. It's impossible to have overlapped osc_extent. + * + * The lifetime of an extent is from when the 1st page is dirtied to when + * all pages inside it are written out. + * + * LOCKING ORDER + * ============= + * page lock -> client_obd_list_lock -> object lock(osc_object::oo_lock) + */ +struct osc_extent { + /** red-black tree node */ + struct rb_node oe_node; + /** osc_object of this extent */ + struct osc_object *oe_obj; + /** refcount, removed from red-black tree if reaches zero. */ + atomic_t oe_refc; + /** busy if non-zero */ + atomic_t oe_users; + /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ + struct list_head oe_link; + /** state of this extent */ + unsigned int oe_state; + /** flags for this extent. */ + unsigned int oe_intree:1, + /** 0 is write, 1 is read */ + oe_rw:1, + oe_srvlock:1, + oe_memalloc:1, + /** an ACTIVE extent is going to be truncated, so when this extent + * is released, it will turn into TRUNC state instead of CACHE. */ + oe_trunc_pending:1, + /** this extent should be written asap and someone may wait for the + * write to finish. This bit is usually set along with urgent if + * the extent was CACHE state. + * fsync_wait extent can't be merged because new extent region may + * exceed fsync range. */ + oe_fsync_wait:1, + /** covering lock is being canceled */ + oe_hp:1, + /** this extent should be written back asap. set if one of pages is + * called by page WB daemon, or sync write or reading requests. */ + oe_urgent:1; + /** how many grants allocated for this extent. + * Grant allocated for this extent. There is no grant allocated + * for reading extents and sync write extents. */ + unsigned int oe_grants; + /** # of dirty pages in this extent */ + unsigned int oe_nr_pages; + /** list of pending oap pages. Pages in this list are NOT sorted. */ + struct list_head oe_pages; + /** Since an extent has to be written out in atomic, this is used to + * remember the next page need to be locked to write this extent out. + * Not used right now. + */ + struct osc_page *oe_next_page; + /** start and end index of this extent, include start and end + * themselves. Page offset here is the page index of osc_pages. + * oe_start is used as keyword for red-black tree. */ + pgoff_t oe_start; + pgoff_t oe_end; + /** maximum ending index of this extent, this is limited by + * max_pages_per_rpc, lock extent and chunk size. */ + pgoff_t oe_max_end; + /** waitqueue - for those who want to be notified if this extent's + * state has changed. */ + wait_queue_head_t oe_waitq; + /** lock covering this extent */ + struct cl_lock *oe_osclock; + /** terminator of this extent. Must be true if this extent is in IO. */ + task_t *oe_owner; + /** return value of writeback. If somebody is waiting for this extent, + * this value can be known by outside world. */ + int oe_rc; + /** max pages per rpc when this extent was created */ + unsigned int oe_mppr; +}; + +int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, + int sent, int rc); +int osc_extent_release(const struct lu_env *env, struct osc_extent *ext); + +/** @} osc */ + +#endif /* OSC_CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c new file mode 100644 index 000000000000..4208ddfd73b3 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_dev.c @@ -0,0 +1,261 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, cl_req for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +struct kmem_cache *osc_lock_kmem; +struct kmem_cache *osc_object_kmem; +struct kmem_cache *osc_thread_kmem; +struct kmem_cache *osc_session_kmem; +struct kmem_cache *osc_req_kmem; +struct kmem_cache *osc_extent_kmem; +struct kmem_cache *osc_quota_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_req_kmem, + .ckd_name = "osc_req_kmem", + .ckd_size = sizeof (struct osc_req) + }, + { + .ckd_cache = &osc_extent_kmem, + .ckd_name = "osc_extent_kmem", + .ckd_size = sizeof (struct osc_extent) + }, + { + .ckd_cache = &osc_quota_kmem, + .ckd_name = "osc_quota_kmem", + .ckd_size = sizeof(struct osc_quota_info) + }, + { + .ckd_cache = NULL + } +}; + +struct lock_class_key osc_ast_guard_class; + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_thread_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR_GFP(info, osc_session_kmem, __GFP_IO); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + ENTRY; + RETURN(osc_process_config_base(d->ld_obd, cfg)); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations osc_cl_ops = { + .cdo_req_init = osc_req_init +}; + +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} + +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return 0; +} + +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + od->od_cl.cd_ops = &osc_cl_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h b/drivers/staging/lustre/lustre/osc/osc_internal.h new file mode 100644 index 000000000000..5343da2fa87a --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_internal.h @@ -0,0 +1,210 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef OSC_INTERNAL_H +#define OSC_INTERNAL_H + +#define OAP_MAGIC 8675309 + +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ + ASYNC_HP = 0x10, +}; + +struct osc_async_page { + int oap_magic; + unsigned short oap_cmd; + unsigned short oap_interrupted:1; + + struct list_head oap_pending_item; + struct list_head oap_rpc_item; + + obd_off oap_obj_off; + unsigned oap_page_off; + enum async_flags oap_async_flags; + + struct brw_page oap_brw_page; + + struct ptlrpc_request *oap_request; + struct client_obd *oap_cli; + struct osc_object *oap_obj; + + struct ldlm_lock *oap_ldlm_lock; + spinlock_t oap_lock; +}; + +#define oap_page oap_brw_page.pg +#define oap_count oap_brw_page.count +#define oap_brw_flags oap_brw_page.flag + +struct osc_cache_waiter { + struct list_head ocw_entry; + wait_queue_head_t ocw_waitq; + struct osc_async_page *ocw_oap; + int ocw_grant; + int ocw_rc; +}; + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti); +void osc_wake_cache_waiters(struct client_obd *cli); +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); +void osc_update_next_shrink(struct client_obd *cli); + +/* + * cl integration. + */ +#include + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl); +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode); + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + int *flags, void *data, struct lustre_handle *lockh, + int unref); + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t p); +int osc_lru_shrink(struct client_obd *cli, int target); + +extern spinlock_t osc_ast_guard; + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + +#ifdef LPROCFS +int lproc_osc_attach_seqstat(struct obd_device *dev); +void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); +#else +static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} +static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) +{ + memset(lvars, 0, sizeof(*lvars)); +} +#endif + +extern struct lu_device_type osc_device_type; + +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || + rc == -EAGAIN || rc == -EINPROGRESS); +} + +static inline unsigned long rpcs_in_flight(struct client_obd *cli) +{ + return cli->cl_r_in_flight + cli->cl_w_in_flight; +} + +#ifndef min_t +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) +#endif + +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. */ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm); + +extern struct kmem_cache *osc_quota_kmem; +struct osc_quota_info { + /** linkage for quota hash table */ + struct hlist_node oqi_hash; + obd_uid oqi_id; +}; +int osc_quota_setup(struct obd_device *obd); +int osc_quota_cleanup(struct obd_device *obd); +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + obd_flag valid, obd_flag flags); +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); + +void osc_inc_unstable_pages(struct ptlrpc_request *req); +void osc_dec_unstable_pages(struct ptlrpc_request *req); +#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c new file mode 100644 index 000000000000..1b277045b3e4 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_io.c @@ -0,0 +1,836 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + * Author: Jinshan Xiong + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct osc_req *cl2osc_req(const struct cl_req_slice *slice) +{ + LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type); + return container_of0(slice, struct osc_req, or_cl); +} + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +static struct osc_page *osc_cl_page_osc(struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + + return cl2osc_page(slice); +} + + +/***************************************************************************** + * + * io operations. + * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags(). + */ +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + LIST_HEAD (list); + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + int queued = 0; + int result = 0; + int cmd; + int brw_flags; + int max_pages; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt); + + osc = cl2osc(ios->cis_obj); + cli = osc_cli(osc); + max_pages = cli->cl_max_pages_per_rpc; + + cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; + + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + + /* Top level IO. */ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page); + oap = &opg->ops_oap; + LASSERT(osc == oap->oap_obj); + + if (!list_empty(&oap->oap_pending_item) || + !list_empty(&oap->oap_rpc_item)) { + CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", + oap, opg); + result = -EBUSY; + break; + } + + result = cl_page_prep(env, io, page, crt); + if (result != 0) { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + continue; + } + + cl_page_list_move(qout, qin, page); + oap->oap_async_flags = ASYNC_URGENT|ASYNC_READY; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + + osc_page_submit(env, opg, crt, brw_flags); + list_add_tail(&oap->oap_pending_item, &list); + if (++queued == max_pages) { + queued = 0; + result = osc_queue_sync_pages(env, osc, &list, cmd, + brw_flags); + if (result < 0) + break; + } + } + + if (queued > 0) + result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); + + CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 0 : result; +} + +static void osc_page_touch_at(const struct lu_env *env, + struct cl_object *obj, pgoff_t idx, unsigned to) +{ + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid; + __u64 kms; + + /* offset within stripe */ + kms = cl_offset(obj, idx) + to; + + cl_object_attr_lock(obj); + /* + * XXX old code used + * + * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); + * + * here + */ + CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n", + kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms, + loi->loi_lvb.lvb_size); + + valid = 0; + if (kms > loi->loi_kms) { + attr->cat_kms = kms; + valid |= CAT_KMS; + } + if (kms > loi->loi_lvb.lvb_size) { + attr->cat_size = kms; + valid |= CAT_SIZE; + } + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); +} + +/** + * This is called when a page is accessed within file in a way that creates + * new page, if one were missing (i.e., if there were a hole at that place in + * the file, or accessed page is beyond the current file size). Examples: + * ->commit_write() and ->nopage() methods. + * + * Expand stripe KMS if necessary. + */ +static void osc_page_touch(const struct lu_env *env, + struct osc_page *opage, unsigned to) +{ + struct cl_page *page = opage->ops_cl.cpl_page; + struct cl_object *obj = opage->ops_cl.cpl_obj; + + osc_page_touch_at(env, obj, page->cp_index, to); +} + +/** + * Implements cl_io_operations::cio_prepare_write() method for osc layer. + * + * \retval -EIO transfer initiated against this osc will most likely fail + * \retval 0 transfer initiated against this osc will most likely succeed. + * + * The reason for this check is to immediately return an error to the caller + * in the case of a deactivated import. Note, that import can be deactivated + * later, while pages, dirtied by this IO, are still in the cache, but this is + * irrelevant, because that would still return an error to the application (if + * it does fsync), but many applications don't do fsync because of performance + * issues, and we wanted to return an -EIO at write time to notify the + * application. + */ +static int osc_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev); + struct obd_import *imp = class_exp2cliimp(dev->od_exp); + struct osc_io *oio = cl2osc_io(env, ios); + int result = 0; + ENTRY; + + /* + * This implements OBD_BRW_CHECK logic from old client. + */ + + if (imp == NULL || imp->imp_invalid) + result = -EIO; + if (result == 0 && oio->oi_lockless) + /* this page contains `invalid' data, but who cares? + * nobody can access the invalid data. + * in osc_io_commit_write(), we're going to write exact + * [from, to) bytes of this page to OST. -jay */ + cl_page_export(env, slice->cpl_page, 1); + + RETURN(result); +} + +static int osc_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct osc_io *oio = cl2osc_io(env, ios); + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + struct osc_async_page *oap = &opg->ops_oap; + ENTRY; + + LASSERT(to > 0); + /* + * XXX instead of calling osc_page_touch() here and in + * osc_io_fault_start() it might be more logical to introduce + * cl_page_touch() method, that generic cl_io_commit_write() and page + * fault code calls. + */ + osc_page_touch(env, cl2osc_page(slice), to); + if (!client_is_remote(osc_export(obj)) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + + if (oio->oi_lockless) + /* see osc_io_prepare_write() for lockless io handling. */ + cl_page_clip(env, slice->cpl_page, from, to); + + RETURN(0); +} + +static int osc_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %d %d\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} + +static int osc_async_upcall(void *a, int rc) +{ + struct osc_async_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +/** + * Checks that there are no pages being written in the extent being truncated. + */ +static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + const struct cl_page_slice *slice; + struct osc_page *ops; + struct osc_async_page *oap; + __u64 start = *(__u64 *)cbdata; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + ops = cl2osc_page(slice); + oap = &ops->ops_oap; + + if (oap->oap_cmd & OBD_BRW_WRITE && + !list_empty(&oap->oap_pending_item)) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists " LPU64 "/%s.\n", + start, current->comm); + + { + struct page *vmpage = cl_page_vmpage(env, page); + if (PageLocked(vmpage)) + CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", + ops, page->cp_index, + (oap->oap_cmd & OBD_BRW_RWMASK)); + } + + return CLP_GANG_OKAY; +} + +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, __u64 size) +{ + struct cl_object *clob; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + + /* + * Complain if there are pages in the truncated region. + */ + cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, + trunc_check_cb, (void *)&size); +} + +static int osc_io_setattr_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + unsigned int ia_valid = io->u.ci_setattr.sa_valid; + int result = 0; + struct obd_info oinfo = { { { 0 } } }; + + /* truncate cache dirty pages first */ + if (cl_io_is_trunc(io)) + result = osc_cache_truncate_start(env, oio, cl2osc(obj), size); + + if (result == 0 && oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; + unsigned int cl_valid = 0; + + if (ia_valid & ATTR_SIZE) { + attr->cat_size = attr->cat_kms = size; + cl_valid = (CAT_SIZE | CAT_KMS); + } + if (ia_valid & ATTR_MTIME_SET) { + attr->cat_mtime = lvb->lvb_mtime; + cl_valid |= CAT_MTIME; + } + if (ia_valid & ATTR_ATIME_SET) { + attr->cat_atime = lvb->lvb_atime; + cl_valid |= CAT_ATIME; + } + if (ia_valid & ATTR_CTIME_SET) { + attr->cat_ctime = lvb->lvb_ctime; + cl_valid |= CAT_CTIME; + } + result = cl_object_attr_set(env, obj, attr, cl_valid); + } + cl_object_attr_unlock(obj); + } + memset(oa, 0, sizeof(*oa)); + if (result == 0) { + oa->o_oi = loi->loi_oi; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME; + if (ia_valid & ATTR_SIZE) { + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_SRVLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + } else { + LASSERT(oio->oi_lockless == 0); + } + + oinfo.oi_oa = oa; + oinfo.oi_capa = io->u.ci_setattr.sa_capa; + init_completion(&cbargs->opc_sync); + + if (ia_valid & ATTR_SIZE) + result = osc_punch_base(osc_export(cl2osc(obj)), + &oinfo, osc_async_upcall, + cbargs, PTLRPCD_SET); + else + result = osc_setattr_async_base(osc_export(cl2osc(obj)), + &oinfo, NULL, + osc_async_upcall, + cbargs, PTLRPCD_SET); + cbargs->opc_rpc_sent = result == 0; + } + return result; +} + +static void osc_io_setattr_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int result = 0; + + if (cbargs->opc_rpc_sent) { + wait_for_completion(&cbargs->opc_sync); + result = io->ci_result = cbargs->opc_rc; + } + if (result == 0) { + if (oio->oi_lockless) { + /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + + LASSERT(cl_io_is_trunc(io)); + /* XXX: Need a lock. */ + osd->od_stats.os_lockless_truncates++; + } + } + + if (cl_io_is_trunc(io)) { + __u64 size = io->u.ci_setattr.sa_attr.lvb_size; + osc_trunc_check(env, io, oio, size); + if (oio->oi_trunc != NULL) { + osc_cache_truncate_end(env, oio, cl2osc(obj)); + oio->oi_trunc = NULL; + } + } +} + +static int osc_io_read_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result = 0; + ENTRY; + + if (oio->oi_lockless == 0) { + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + attr->cat_atime = LTIME_S(CFS_CURRENT_TIME); + result = cl_object_attr_set(env, obj, attr, + CAT_ATIME); + } + cl_object_attr_unlock(obj); + } + RETURN(result); +} + +static int osc_io_write_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int result = 0; + ENTRY; + + if (oio->oi_lockless == 0) { + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + attr->cat_mtime = attr->cat_ctime = + LTIME_S(CFS_CURRENT_TIME); + result = cl_object_attr_set(env, obj, attr, + CAT_MTIME | CAT_CTIME); + } + cl_object_attr_unlock(obj); + } + RETURN(result); +} + +static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, + struct cl_fsync_io *fio) +{ + struct osc_io *oio = osc_env_io(env); + struct obdo *oa = &oio->oi_oa; + struct obd_info *oinfo = &oio->oi_info; + struct lov_oinfo *loi = obj->oo_oinfo; + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + int rc = 0; + ENTRY; + + memset(oa, 0, sizeof(*oa)); + oa->o_oi = loi->loi_oi; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + /* reload size abd blocks for start and end of sync range */ + oa->o_size = fio->fi_start; + oa->o_blocks = fio->fi_end; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + obdo_set_parent_fid(oa, fio->fi_fid); + + memset(oinfo, 0, sizeof(*oinfo)); + oinfo->oi_oa = oa; + oinfo->oi_capa = fio->fi_capa; + init_completion(&cbargs->opc_sync); + + rc = osc_sync_base(osc_export(obj), oinfo, osc_async_upcall, cbargs, + PTLRPCD_SET); + RETURN(rc); +} + +static int osc_io_fsync_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct cl_fsync_io *fio = &io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + struct osc_object *osc = cl2osc(obj); + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + ENTRY; + + if (fio->fi_end == OBD_OBJECT_EOF) + end = CL_PAGE_EOF; + + result = osc_cache_writeback_range(env, osc, start, end, 0, + fio->fi_mode == CL_FSYNC_DISCARD); + if (result > 0) { + fio->fi_nr_written += result; + result = 0; + } + if (fio->fi_mode == CL_FSYNC_ALL) { + int rc; + + /* we have to wait for writeback to finish before we can + * send OST_SYNC RPC. This is bad because it causes extents + * to be written osc by osc. However, we usually start + * writeback before CL_FSYNC_ALL so this won't have any real + * problem. */ + rc = osc_cache_wait_range(env, osc, start, end); + if (result == 0) + result = rc; + rc = osc_fsync_ost(env, osc, fio); + if (result == 0) + result = rc; + } + + RETURN(result); +} + +static void osc_io_fsync_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; + struct cl_object *obj = slice->cis_obj; + pgoff_t start = cl_index(obj, fio->fi_start); + pgoff_t end = cl_index(obj, fio->fi_end); + int result = 0; + + if (fio->fi_mode == CL_FSYNC_LOCAL) { + result = osc_cache_wait_range(env, cl2osc(obj), start, end); + } else if (fio->fi_mode == CL_FSYNC_ALL) { + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_async_cbargs *cbargs = &oio->oi_cbarg; + + wait_for_completion(&cbargs->opc_sync); + if (result == 0) + result = cbargs->opc_rc; + } + slice->cis_io->ci_result = result; +} + +static void osc_io_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = cl2osc_io(env, slice); + + if (oio->oi_active) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } +} + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_start = osc_io_read_start, + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_start = osc_io_write_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_SETATTR] = { + .cio_start = osc_io_setattr_start, + .cio_end = osc_io_setattr_end + }, + [CIT_FAULT] = { + .cio_start = osc_io_fault_start, + .cio_end = osc_io_end, + .cio_fini = osc_io_fini + }, + [CIT_FSYNC] = { + .cio_start = osc_io_fsync_start, + .cio_end = osc_io_fsync_end, + .cio_fini = osc_io_fini + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = osc_io_submit + }, + [CRT_WRITE] = { + .cio_submit = osc_io_submit + } + }, + .cio_prepare_write = osc_io_prepare_write, + .cio_commit_write = osc_io_commit_write +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +static int osc_req_prep(const struct lu_env *env, + const struct cl_req_slice *slice) +{ + return 0; +} + +static void osc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct osc_req *or; + + or = cl2osc_req(slice); + OBD_SLAB_FREE_PTR(or, osc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq + * fields. + */ +static void osc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct lov_oinfo *oinfo; + struct cl_req *clerq; + struct cl_page *apage; /* _some_ page in @clerq */ + struct cl_lock *lock; /* _some_ lock protecting @apage */ + struct osc_lock *olck; + struct osc_page *opg; + struct obdo *oa; + struct ost_lvb *lvb; + + oinfo = cl2osc(obj)->oo_oinfo; + lvb = &oinfo->loi_lvb; + oa = attr->cra_oa; + + if ((flags & OBD_MD_FLMTIME) != 0) { + oa->o_mtime = lvb->lvb_mtime; + oa->o_valid |= OBD_MD_FLMTIME; + } + if ((flags & OBD_MD_FLATIME) != 0) { + oa->o_atime = lvb->lvb_atime; + oa->o_valid |= OBD_MD_FLATIME; + } + if ((flags & OBD_MD_FLCTIME) != 0) { + oa->o_ctime = lvb->lvb_ctime; + oa->o_valid |= OBD_MD_FLCTIME; + } + if (flags & OBD_MD_FLGROUP) { + ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLID) { + ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLHANDLE) { + clerq = slice->crs_req; + LASSERT(!list_empty(&clerq->crq_pages)); + apage = container_of(clerq->crq_pages.next, + struct cl_page, cp_flight); + opg = osc_cl_page_osc(apage); + apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */ + lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1); + if (lock == NULL) { + struct cl_object_header *head; + struct cl_lock *scan; + + head = cl_object_header(apage->cp_obj); + list_for_each_entry(scan, &head->coh_locks, + cll_linkage) + CL_LOCK_DEBUG(D_ERROR, env, scan, + "no cover page!\n"); + CL_PAGE_DEBUG(D_ERROR, env, apage, + "dump uncover page!\n"); + libcfs_debug_dumpstack(NULL); + LBUG(); + } + + olck = osc_lock_at(lock); + LASSERT(olck != NULL); + LASSERT(ergo(opg->ops_srvlock, olck->ols_lock == NULL)); + /* check for lockless io. */ + if (olck->ols_lock != NULL) { + oa->o_handle = olck->ols_lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + cl_lock_put(env, lock); + } +} + +static const struct cl_req_operations osc_req_ops = { + .cro_prep = osc_req_prep, + .cro_attr_set = osc_req_attr_set, + .cro_completion = osc_req_completion +}; + + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +int osc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct osc_req *or; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(or, osc_req_kmem, __GFP_IO); + if (or != NULL) { + cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c new file mode 100644 index 000000000000..640bc3d34709 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_lock.c @@ -0,0 +1,1663 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +# include +/* fid_build_reg_res_name() */ +#include + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +#define _PAGEREF_MAGIC (-10000000) + +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force); +static int osc_lock_has_pages(struct osc_lock *olck); + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). + */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_lock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + return + ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_lock == NULL) || + (ergo(olock != NULL, handle_used) && + ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie) && + /* + * Check that ->ols_handle and ->ols_lock are consistent, but + * take into account that they are set at the different time. + */ + ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL)) && + ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used) && + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !olock->l_destroyed) && + ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + olock->l_req_mode == olock->l_granted_mode && + ols->ols_hold)); +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +/** + * Breaks a link between osc_lock and dlm_lock. + */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + spin_lock(&osc_ast_guard); + dlmlock = olck->ols_lock; + if (dlmlock == NULL) { + spin_unlock(&osc_ast_guard); + return; + } + + olck->ols_lock = NULL; + /* wb(); --- for all who checks (ols->ols_lock != NULL) before + * call to osc_lock_detach() */ + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_object *obj = olck->ols_cl.cls_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms; + + cl_object_attr_lock(obj); + /* Must get the value under the lock to avoid possible races. */ + old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + + cl_object_attr_set(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + } + unlock_res_and_lock(dlmlock); + + /* release a reference taken in osc_lock_upcall0(). */ + LASSERT(olck->ols_has_ref); + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); + olck->ols_has_ref = 0; +} + +static int osc_lock_unhold(struct osc_lock *ols) +{ + int result = 0; + + if (ols->ols_hold) { + ols->ols_hold = 0; + result = osc_cancel_base(&ols->ols_handle, + ols->ols_einfo.ei_mode); + } + return result; +} + +static int osc_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + + switch (ols->ols_state) { + case OLS_NEW: + LASSERT(!ols->ols_hold); + LASSERT(ols->ols_agl); + return 0; + case OLS_UPCALL_RECEIVED: + osc_lock_unhold(ols); + case OLS_ENQUEUED: + LASSERT(!ols->ols_hold); + osc_lock_detach(env, ols); + ols->ols_state = OLS_NEW; + return 0; + case OLS_GRANTED: + LASSERT(!ols->ols_glimpse); + LASSERT(ols->ols_hold); + /* + * Move lock into OLS_RELEASED state before calling + * osc_cancel_base() so that possible synchronous cancellation + * (that always happens e.g., for liblustre) sees that lock is + * released. + */ + ols->ols_state = OLS_RELEASED; + return osc_lock_unhold(ols); + default: + CERROR("Impossible state: %d\n", ols->ols_state); + LBUG(); + } +} + +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + /* + * ->ols_hold can still be true at this point if, for example, a + * thread that requested a lock was killed (and released a reference + * to the lock), before reply from a server was received. In this case + * lock is destroyed immediately after upcall. + */ + osc_lock_unhold(ols); + LASSERT(ols->ols_lock == NULL); + LASSERT(atomic_read(&ols->ols_pageref) == 0 || + atomic_read(&ols->ols_pageref) == _PAGEREF_MAGIC); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + ldlm_policy_data_t *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); + policy->l_extent.gid = d->cld_gid; +} + +static __u64 osc_enq2ldlm_flags(__u32 enqflags) +{ + __u64 result = 0; + + LASSERT((enqflags & ~CEF_MASK) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_AST_DISCARD_DATA; + return result; +} + +/** + * Global spin-lock protecting consistency of ldlm_lock::l_ast_data + * pointers. Initialized in osc_init(). + */ +spinlock_t osc_ast_guard; + +static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock) +{ + struct osc_lock *olck; + + lock_res_and_lock(dlm_lock); + spin_lock(&osc_ast_guard); + olck = dlm_lock->l_ast_data; + if (olck != NULL) { + struct cl_lock *lock = olck->ols_cl.cls_lock; + /* + * If osc_lock holds a reference on ldlm lock, return it even + * when cl_lock is in CLS_FREEING state. This way + * + * osc_ast_data_get(dlmlock) == NULL + * + * guarantees that all osc references on dlmlock were + * released. osc_dlm_blocking_ast0() relies on that. + */ + if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) { + cl_lock_get_trust(lock); + lu_ref_add_atomic(&lock->cll_reference, + "ast", current); + } else + olck = NULL; + } + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlm_lock); + return olck; +} + +static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck) +{ + struct cl_lock *lock; + + lock = olck->ols_cl.cls_lock; + lu_ref_del(&lock->cll_reference, "ast", current); + cl_lock_put(env, lock); +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. Copy of osc_update_enqueue() + * logic. + * + * This can be optimized to not update attributes when lock is a result of a + * local match. + * + * Called under lock and resource spin-locks. + */ +static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, + int rc) +{ + struct ost_lvb *lvb; + struct cl_object *obj; + struct lov_oinfo *oinfo; + struct cl_attr *attr; + unsigned valid; + + ENTRY; + + if (!(olck->ols_flags & LDLM_FL_LVB_READY)) + RETURN_EXIT; + + lvb = &olck->ols_lvb; + obj = olck->ols_cl.cls_obj; + oinfo = cl2osc(obj)->oo_oinfo; + attr = &osc_env_info(env)->oti_attr; + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (rc == 0) { + struct ldlm_lock *dlmlock; + __u64 size; + + dlmlock = olck->ols_lock; + LASSERT(dlmlock != NULL); + + /* re-grab LVB from a dlm lock under DLM spin-locks. */ + *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + size = lvb->lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64 + ", kms="LPU64, lvb->lvb_size, size); + valid |= CAT_KMS; + attr->cat_kms = size; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" + LPU64"; leaving kms="LPU64", end="LPU64, + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match_locked(dlmlock); + } else if (rc == -ENAVAIL && olck->ols_glimpse) { + CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" + " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms); + } else + valid = 0; + + if (valid != 0) + cl_object_attr_set(env, obj, attr, valid); + + cl_object_attr_unlock(obj); + + EXIT; +} + +/** + * Called when a lock is granted, from an upcall (when server returned a + * granted lock), or from completion AST, when server returned a blocked lock. + * + * Called under lock and resource spin-locks, that are released temporarily + * here. + */ +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, + struct ldlm_lock *dlmlock, int rc) +{ + struct ldlm_extent *ext; + struct cl_lock *lock; + struct cl_lock_descr *descr; + + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + + ENTRY; + if (olck->ols_state < OLS_GRANTED) { + lock = olck->ols_cl.cls_lock; + ext = &dlmlock->l_policy_data.l_extent; + descr = &osc_env_info(env)->oti_descr; + descr->cld_obj = lock->cll_descr.cld_obj; + + /* XXX check that ->l_granted_mode is valid. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + descr->cld_gid = ext->gid; + /* + * tell upper layers the extent of the lock that was actually + * granted + */ + olck->ols_state = OLS_GRANTED; + osc_lock_lvb_update(env, olck, rc); + + /* release DLM spin-locks to allow cl_lock_{modify,signal}() + * to take a semaphore on a parent lock. This is safe, because + * spin-locks are needed to protect consistency of + * dlmlock->l_*_mode and LVB, and we have finished processing + * them. */ + unlock_res_and_lock(dlmlock); + cl_lock_modify(env, lock, descr); + cl_lock_signal(env, lock); + LINVRNT(osc_lock_invariant(olck)); + lock_res_and_lock(dlmlock); + } + EXIT; +} + +static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) + +{ + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(dlmlock->l_ast_data == olck); + LASSERT(olck->ols_lock == NULL); + olck->ols_lock = dlmlock; + spin_unlock(&osc_ast_guard); + + /* + * Lock might be not yet granted. In this case, completion ast + * (osc_ldlm_completion_ast()) comes later and finishes lock + * granting. + */ + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) + osc_lock_granted(env, olck, dlmlock, 0); + unlock_res_and_lock(dlmlock); + + /* + * osc_enqueue_interpret() decrefs asynchronous locks, counter + * this. + */ + ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_hold = 1; + + /* lock reference taken by ldlm_handle2lock_long() is owned by + * osc_lock and released in osc_lock_detach() */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", olck); + olck->ols_has_ref = 1; +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int osc_lock_upcall(void *cookie, int errcode) +{ + struct osc_lock *olck = cookie; + struct cl_lock_slice *slice = &olck->ols_cl; + struct cl_lock *lock = slice->cls_lock; + struct lu_env *env; + struct cl_env_nest nest; + + ENTRY; + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + int rc; + + cl_lock_mutex_get(env, lock); + + LASSERT(lock->cll_state >= CLS_QUEUING); + if (olck->ols_state == OLS_ENQUEUED) { + olck->ols_state = OLS_UPCALL_RECEIVED; + rc = ldlm_error2errno(errcode); + } else if (olck->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %d\n", olck->ols_state); + LBUG(); + } + if (rc) { + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock(&olck->ols_handle); + if (dlmlock != NULL) { + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(olck->ols_lock == NULL); + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + ldlm_lock_fail_match_locked(dlmlock); + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + } + } else { + if (olck->ols_glimpse) + olck->ols_glimpse = 0; + osc_lock_upcall0(env, olck); + } + + /* Error handling, some errors are tolerable. */ + if (olck->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops == &osc_lock_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(env, olck, 1); + olck->ols_state = OLS_GRANTED; + rc = 0; + } else if (olck->ols_glimpse && rc == -ENAVAIL) { + osc_lock_lvb_update(env, olck, rc); + cl_lock_delete(env, lock); + /* Hide the error. */ + rc = 0; + } + + if (rc == 0) { + /* For AGL case, the RPC sponsor may exits the cl_lock + * processing without wait() called before related OSC + * lock upcall(). So update the lock status according + * to the enqueue result inside AGL upcall(). */ + if (olck->ols_agl) { + lock->cll_flags |= CLF_FROM_UPCALL; + cl_wait_try(env, lock); + lock->cll_flags &= ~CLF_FROM_UPCALL; + if (!olck->ols_glimpse) + olck->ols_agl = 0; + } + cl_lock_signal(env, lock); + /* del user for lock upcall cookie */ + cl_unuse_try(env, lock); + } else { + /* del user for lock upcall cookie */ + cl_lock_user_del(env, lock); + cl_lock_error(env, lock, rc); + } + + /* release cookie reference, acquired by osc_lock_enqueue() */ + cl_lock_hold_release(env, lock, "upcall", lock); + cl_lock_mutex_put(env, lock); + + lu_ref_del(&lock->cll_reference, "upcall", lock); + /* This maybe the last reference, so must be called after + * cl_lock_mutex_put(). */ + cl_lock_put(env, lock); + + cl_env_nested_put(&nest, env); + } else { + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LBUG(); + } + RETURN(errcode); +} + +/** + * Core of osc_dlm_blocking_ast() logic. + */ +static void osc_lock_blocking(const struct lu_env *env, + struct ldlm_lock *dlmlock, + struct osc_lock *olck, int blocking) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LASSERT(olck->ols_lock == dlmlock); + CLASSERT(OLS_BLOCKED < OLS_CANCELLED); + LASSERT(!osc_lock_is_lockless(olck)); + + /* + * Lock might be still addref-ed here, if e.g., blocking ast + * is sent for a failed lock. + */ + osc_lock_unhold(olck); + + if (blocking && olck->ols_state < OLS_BLOCKED) + /* + * Move osc_lock into OLS_BLOCKED before canceling the lock, + * because it recursively re-enters osc_lock_blocking(), with + * the state set to OLS_CANCELLED. + */ + olck->ols_state = OLS_BLOCKED; + /* + * cancel and destroy lock at least once no matter how blocking ast is + * entered (see comment above osc_ldlm_blocking_ast() for use + * cases). cl_lock_cancel() and cl_lock_delete() are idempotent. + */ + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int cancel; + + LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING); + + cancel = 0; + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + LINVRNT(osc_lock_invariant(olck)); + if (olck->ols_ast_wait) { + /* wake up osc_lock_use() */ + cl_lock_signal(env, lock); + olck->ols_ast_wait = 0; + } + /* + * Lock might have been canceled while this thread was + * sleeping for lock mutex, but olck is pinned in memory. + */ + if (olck == dlmlock->l_ast_data) { + /* + * NOTE: DLM sends blocking AST's for failed locks + * (that are still in pre-OLS_GRANTED state) + * too, and they have to be canceled otherwise + * DLM lock is never destroyed and stuck in + * the memory. + * + * Alternatively, ldlm_cli_cancel() can be + * called here directly for osc_locks with + * ols_state < OLS_GRANTED to maintain an + * invariant that ->clo_cancel() is only called + * for locks that were granted. + */ + LASSERT(data == olck); + osc_lock_blocking(env, dlmlock, + olck, flag == LDLM_CB_BLOCKING); + } else + cancel = 1; + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + } else + /* + * DLM lock exists, but there is no cl_lock attached to it. + * This is a `normal' race. cl_object and its cl_lock's can be + * removed by memory pressure, together with all pages. + */ + cancel = (flag == LDLM_CB_BLOCKING); + + if (cancel) { + struct lustre_handle *lockh; + + lockh = &osc_env_info(env)->oti_handle; + ldlm_lock2handle(dlmlock, lockh); + result = ldlm_cli_cancel(lockh, LCF_ASYNC); + } else + result = 0; + return result; +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). + * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + struct lu_env *env; + struct cl_env_nest nest; + int result; + + /* + * This can be called in the context of outer IO, e.g., + * + * cl_enqueue()->... + * ->osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer context. + */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + cl_env_nested_put(&nest, env); + } else { + result = PTR_ERR(env); + /* + * XXX This should never happen, as cl_lock is + * stuck. Pre-allocated environment a la vvp_inode_fini_env + * should be used. + */ + LBUG(); + } + if (result != 0) { + if (result == -ENODATA) + result = 0; + else + CERROR("BAST failed: %d\n", result); + } + return result; +} + +static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, + __u64 flags, void *data) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int dlmrc; + + /* first, do dlm part of the work */ + dlmrc = ldlm_completion_ast_async(dlmlock, flags, data); + /* then, notify cl_lock */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + /* + * ldlm_handle_cp_callback() copied LVB from request + * to lock->l_lvb_data, store it in osc_lock. + */ + LASSERT(dlmlock->l_lvb_data != NULL); + lock_res_and_lock(dlmlock); + olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + if (olck->ols_lock == NULL) { + /* + * upcall (osc_lock_upcall()) hasn't yet been + * called. Do nothing now, upcall will bind + * olck to dlmlock and signal the waiters. + * + * This maintains an invariant that osc_lock + * and ldlm_lock are always bound when + * osc_lock is in OLS_GRANTED state. + */ + } else if (dlmlock->l_granted_mode == + dlmlock->l_req_mode) { + osc_lock_granted(env, olck, dlmlock, dlmrc); + } + unlock_res_and_lock(dlmlock); + + if (dlmrc != 0) { + CL_LOCK_DEBUG(D_ERROR, env, lock, + "dlmlock returned %d\n", dlmrc); + cl_lock_error(env, lock, dlmrc); + } + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + result = 0; + } else + result = -ELDLM_NO_LOCK_DATA; + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + return dlmrc ?: result; +} + +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct osc_lock *olck; + struct cl_lock *lock; + struct cl_object *obj; + struct cl_env_nest nest; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + int result; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + /* osc_ast_data_get() has to go after environment is + * allocated, because osc_ast_data() acquires a + * reference to a lock, and it can only be released in + * environment. + */ + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + /* Do not grab the mutex of cl_lock for glimpse. + * See LU-1274 for details. + * BTW, it's okay for cl_lock to be cancelled during + * this period because server can handle this race. + * See ldlm_server_glimpse_ast() for details. + * cl_lock_mutex_get(env, lock); */ + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + obj = lock->cll_descr.cld_obj; + result = cl_object_glimpse(env, obj, lvb); + } + if (!exp_connect_lvb_type(req->rq_export)) + req_capsule_shrink(&req->rq_pill, + &RMF_DLM_LVB, + sizeof(struct ost_lvb_v1), + RCL_SERVER); + osc_ast_data_put(env, olck); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + req->rq_status = result; + return result; +} + +static unsigned long osc_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + /* + * don't need to grab coh_page_guard since we don't care the exact # + * of pages.. + */ + return cl_object_header(slice->cls_obj)->coh_pages; +} + +/** + * Get the weight of dlm lock for early cancellation. + * + * XXX: it should return the pages covered by this \a dlmlock. + */ +static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct osc_lock *lock; + struct cl_lock *cll; + unsigned long weight; + ENTRY; + + might_sleep(); + /* + * osc_ldlm_weigh_ast has a complex context since it might be called + * because of lock canceling, or from user's input. We have to make + * a new environment for it. Probably it is implementation safe to use + * the upper context because cl_lock_put don't modify environment + * variables. But in case of .. + */ + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + /* Mostly because lack of memory, tend to eliminate this lock*/ + RETURN(0); + + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); + lock = osc_ast_data_get(dlmlock); + if (lock == NULL) { + /* cl_lock was destroyed because of memory pressure. + * It is much reasonable to assign this type of lock + * a lower cost. + */ + GOTO(out, weight = 0); + } + + cll = lock->ols_cl.cls_lock; + cl_lock_mutex_get(env, cll); + weight = cl_lock_weigh(env, cll); + cl_lock_mutex_put(env, cll); + osc_ast_data_put(env, lock); + EXIT; + +out: + cl_env_nested_put(&nest, env); + return weight; +} + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *clock, + struct osc_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + enum cl_lock_mode mode; + + mode = clock->cll_descr.cld_mode; + if (mode == CLM_PHANTOM) + /* + * For now, enqueue all glimpse locks in read mode. In the + * future, client might choose to enqueue LCK_PW lock for + * glimpse on a file opened for write. + */ + mode = CLM_READ; + + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = osc_ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cb_wg = osc_ldlm_weigh_ast; + einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */ +} + +/** + * Determine if the lock should be converted into a lockless lock. + * + * Steps to check: + * - if the lock has an explicite requirment for a non-lockless lock; + * - if the io lock request type ci_lockreq; + * - send the enqueue rpc to ost to make the further decision; + * - special treat to truncate lockless lock + * + * Additional policy can be implemented here, e.g., never do lockless-io + * for large extents. + */ +static void osc_lock_to_lockless(const struct lu_env *env, + struct osc_lock *ols, int force) +{ + struct cl_lock_slice *slice = &ols->ols_cl; + + LASSERT(ols->ols_state == OLS_NEW || + ols->ols_state == OLS_UPCALL_RECEIVED); + + if (force) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } else { + struct osc_io *oio = osc_env_io(env); + struct cl_io *io = oio->oi_cl.cis_io; + struct cl_object *obj = slice->cls_obj; + struct osc_object *oob = cl2osc(obj); + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (ols->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (cl_io_is_trunc(io) && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ols->ols_locklessable = 1; + slice->cls_ops = &osc_lock_lockless_ops; + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); +} + +static int osc_lock_compatible(const struct osc_lock *qing, + const struct osc_lock *qed) +{ + enum cl_lock_mode qing_mode; + enum cl_lock_mode qed_mode; + + qing_mode = qing->ols_cl.cls_lock->cll_descr.cld_mode; + if (qed->ols_glimpse && + (qed->ols_state >= OLS_UPCALL_RECEIVED || qing_mode == CLM_READ)) + return 1; + + qed_mode = qed->ols_cl.cls_lock->cll_descr.cld_mode; + return ((qing_mode == CLM_READ) && (qed_mode == CLM_READ)); +} + +/** + * Cancel all conflicting locks and wait for them to be destroyed. + * + * This function is used for two purposes: + * + * - early cancel all conflicting locks before starting IO, and + * + * - guarantee that pages added to the page cache by lockless IO are never + * covered by locks other than lockless IO lock, and, hence, are not + * visible to other threads. + */ +static int osc_lock_enqueue_wait(const struct lu_env *env, + const struct osc_lock *olck) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + struct cl_lock_descr *descr = &lock->cll_descr; + struct cl_object_header *hdr = cl_object_header(descr->cld_obj); + struct cl_lock *scan; + struct cl_lock *conflict= NULL; + int lockless = osc_lock_is_lockless(olck); + int rc = 0; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + + /* make it enqueue anyway for glimpse lock, because we actually + * don't need to cancel any conflicting locks. */ + if (olck->ols_glimpse) + return 0; + + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + struct cl_lock_descr *cld = &scan->cll_descr; + const struct osc_lock *scan_ols; + + if (scan == lock) + break; + + if (scan->cll_state < CLS_QUEUING || + scan->cll_state == CLS_FREEING || + cld->cld_start > descr->cld_end || + cld->cld_end < descr->cld_start) + continue; + + /* overlapped and living locks. */ + + /* We're not supposed to give up group lock. */ + if (scan->cll_descr.cld_mode == CLM_GROUP) { + LASSERT(descr->cld_mode != CLM_GROUP || + descr->cld_gid != scan->cll_descr.cld_gid); + continue; + } + + scan_ols = osc_lock_at(scan); + + /* We need to cancel the compatible locks if we're enqueuing + * a lockless lock, for example: + * imagine that client has PR lock on [0, 1000], and thread T0 + * is doing lockless IO in [500, 1500] region. Concurrent + * thread T1 can see lockless data in [500, 1000], which is + * wrong, because these data are possibly stale. */ + if (!lockless && osc_lock_compatible(olck, scan_ols)) + continue; + + cl_lock_get_trust(scan); + conflict = scan; + break; + } + spin_unlock(&hdr->coh_lock_guard); + + if (conflict) { + if (lock->cll_descr.cld_mode == CLM_GROUP) { + /* we want a group lock but a previous lock request + * conflicts, we do not wait but return 0 so the + * request is send to the server + */ + CDEBUG(D_DLMTRACE, "group lock %p is conflicted " + "with %p, no wait, send to server\n", + lock, conflict); + cl_lock_put(env, conflict); + rc = 0; + } else { + CDEBUG(D_DLMTRACE, "lock %p is conflicted with %p, " + "will wait\n", + lock, conflict); + LASSERT(lock->cll_conflict == NULL); + lu_ref_add(&conflict->cll_reference, "cancel-wait", + lock); + lock->cll_conflict = conflict; + rc = CLO_WAIT; + } + } + RETURN(rc); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. + */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *unused, __u32 enqflags) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = ols->ols_cl.cls_lock; + int result; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERTF(ols->ols_state == OLS_NEW, + "Impossible state: %d\n", ols->ols_state); + + LASSERTF(ergo(ols->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), + "lock = %p, ols = %p\n", lock, ols); + + result = osc_lock_enqueue_wait(env, ols); + if (result == 0) { + if (!osc_lock_is_lockless(ols)) { + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + ldlm_policy_data_t *policy = &info->oti_policy; + struct ldlm_enqueue_info *einfo = &ols->ols_einfo; + + /* lock will be passed as upcall cookie, + * hold ref to prevent to be released. */ + cl_lock_hold_add(env, lock, "upcall", lock); + /* a user for lock also */ + cl_lock_user_add(env, lock); + ols->ols_state = OLS_ENQUEUED; + + /* + * XXX: this is possible blocking point as + * ldlm_lock_match(LDLM_FL_LVB_READY) waits for + * LDLM_CP_CALLBACK. + */ + ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); + osc_lock_build_policy(env, lock, policy); + result = osc_enqueue_base(osc_export(obj), resname, + &ols->ols_flags, policy, + &ols->ols_lvb, + obj->oo_oinfo->loi_kms_valid, + osc_lock_upcall, + ols, einfo, &ols->ols_handle, + PTLRPCD_SET, 1, ols->ols_agl); + if (result != 0) { + cl_lock_user_del(env, lock); + cl_lock_unhold(env, lock, "upcall", lock); + if (unlikely(result == -ECANCELED)) { + ols->ols_state = OLS_NEW; + result = 0; + } + } + } else { + ols->ols_state = OLS_GRANTED; + ols->ols_owner = osc_env_io(env); + } + } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); + RETURN(result); +} + +static int osc_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + + if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) { + if (olck->ols_flags & LDLM_FL_LVB_READY) { + return 0; + } else if (olck->ols_agl) { + if (lock->cll_flags & CLF_FROM_UPCALL) + /* It is from enqueue RPC reply upcall for + * updating state. Do not re-enqueue. */ + return -ENAVAIL; + else + olck->ols_state = OLS_NEW; + } else { + LASSERT(lock->cll_error); + return lock->cll_error; + } + } + + if (olck->ols_state == OLS_NEW) { + int rc; + + LASSERT(olck->ols_agl); + olck->ols_agl = 0; + rc = osc_lock_enqueue(env, slice, NULL, CEF_ASYNC | CEF_MUST); + if (rc != 0) + return rc; + else + return CLO_REENQUEUED; + } + + LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED && + lock->cll_error == 0, olck->ols_lock != NULL)); + + return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT; +} + +/** + * An implementation of cl_lock_operations::clo_use() method that pins cached + * lock. + */ +static int osc_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + int rc; + + LASSERT(!olck->ols_hold); + + /* + * Atomically check for LDLM_FL_CBPENDING and addref a lock if this + * flag is not set. This protects us from a concurrent blocking ast. + */ + rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode); + if (rc == 0) { + olck->ols_hold = 1; + olck->ols_state = OLS_GRANTED; + } else { + struct cl_lock *lock; + + /* + * Lock is being cancelled somewhere within + * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already + * set, but osc_ldlm_blocking_ast() hasn't yet acquired + * cl_lock mutex. + */ + lock = slice->cls_lock; + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(lock->cll_users > 0); + /* set a flag for osc_dlm_blocking_ast0() to signal the + * lock.*/ + olck->ols_ast_wait = 1; + rc = CLO_WAIT; + } + return rc; +} + +static int osc_lock_flush(struct osc_lock *ols, int discard) +{ + struct cl_lock *lock = ols->ols_cl.cls_lock; + struct cl_env_nest nest; + struct lu_env *env; + int result = 0; + ENTRY; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + struct osc_object *obj = cl2osc(ols->ols_cl.cls_obj); + struct cl_lock_descr *descr = &lock->cll_descr; + int rc = 0; + + if (descr->cld_mode >= CLM_WRITE) { + result = osc_cache_writeback_range(env, obj, + descr->cld_start, descr->cld_end, + 1, discard); + LDLM_DEBUG(ols->ols_lock, + "lock %p: %d pages were %s.\n", lock, result, + discard ? "discarded" : "written"); + if (result > 0) + result = 0; + } + + rc = cl_lock_discard_pages(env, lock); + if (result == 0 && rc < 0) + result = rc; + + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + if (result == 0) { + ols->ols_flush = 1; + LINVRNT(!osc_lock_has_pages(ols)); + } + RETURN(result); +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct osc_lock *olck = cl2osc_lock(slice); + struct ldlm_lock *dlmlock = olck->ols_lock; + int result = 0; + int discard; + + LASSERT(cl_lock_is_mutexed(lock)); + LINVRNT(osc_lock_invariant(olck)); + + if (dlmlock != NULL) { + int do_cancel; + + discard = !!(dlmlock->l_flags & LDLM_FL_DISCARD_DATA); + if (olck->ols_state >= OLS_GRANTED) + result = osc_lock_flush(olck, discard); + osc_lock_unhold(olck); + + lock_res_and_lock(dlmlock); + /* Now that we're the only user of dlm read/write reference, + * mostly the ->l_readers + ->l_writers should be zero. + * However, there is a corner case. + * See bug 18829 for details.*/ + do_cancel = (dlmlock->l_readers == 0 && + dlmlock->l_writers == 0); + dlmlock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(dlmlock); + if (do_cancel) + result = ldlm_cli_cancel(&olck->ols_handle, LCF_ASYNC); + if (result < 0) + CL_LOCK_DEBUG(D_ERROR, env, lock, + "lock %p cancel failure with error(%d)\n", + lock, result); + } + olck->ols_state = OLS_CANCELLED; + olck->ols_flags &= ~LDLM_FL_LVB_READY; + osc_lock_detach(env, olck); +} + +static int osc_lock_has_pages(struct osc_lock *olck) +{ + return 0; +} + +static void osc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck; + + olck = cl2osc_lock(slice); + if (olck->ols_glimpse) { + LASSERT(!olck->ols_hold); + LASSERT(!olck->ols_lock); + return; + } + + LINVRNT(osc_lock_invariant(olck)); + LINVRNT(!osc_lock_has_pages(olck)); + + osc_lock_unhold(olck); + osc_lock_detach(env, olck); +} + +/** + * Implements cl_lock_operations::clo_state() method for osc layer. + * + * Maintains osc_lock::ols_owner field. + * + * This assumes that lock always enters CLS_HELD (from some other state) in + * the same IO context as one that requested the lock. This should not be a + * problem, because context is by definition shared by all activity pertaining + * to the same high-level IO. + */ +static void osc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX multiple io contexts can use the lock at the same time. + */ + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(lock->ols_owner == NULL); + lock->ols_owner = oio; + } else if (state != CLS_HELD) + lock->ols_owner = NULL; +} + +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX print ldlm lock and einfo properly. + */ + (*p)(env, cookie, "%p %#16llx "LPX64" %d %p ", + lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} + +static int osc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + if (need->cld_enq_flags & CEF_NEVER) + return 0; + + if (ols->ols_state >= OLS_CANCELLED) + return 0; + + if (need->cld_mode == CLM_PHANTOM) { + if (ols->ols_agl) + return !(ols->ols_state > OLS_RELEASED); + + /* + * Note: the QUEUED lock can't be matched here, otherwise + * it might cause the deadlocks. + * In read_process, + * P1: enqueued read lock, create sublock1 + * P2: enqueued write lock, create sublock2(conflicted + * with sublock1). + * P1: Grant read lock. + * P1: enqueued glimpse lock(with holding sublock1_read), + * matched with sublock2, waiting sublock2 to be granted. + * But sublock2 can not be granted, because P1 + * will not release sublock1. Bang! + */ + if (ols->ols_state < OLS_GRANTED || + ols->ols_state > OLS_RELEASED) + return 0; + } else if (need->cld_enq_flags & CEF_MUST) { + /* + * If the lock hasn't ever enqueued, it can't be matched + * because enqueue process brings in many information + * which can be used to determine things such as lockless, + * CEF_MUST, etc. + */ + if (ols->ols_state < OLS_UPCALL_RECEIVED && + ols->ols_locklessable) + return 0; + } + return 1; +} + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_wait, + .clo_unuse = osc_lock_unuse, + .clo_use = osc_lock_use, + .clo_delete = osc_lock_delete, + .clo_state = osc_lock_state, + .clo_cancel = osc_lock_cancel, + .clo_weigh = osc_lock_weigh, + .clo_print = osc_lock_print, + .clo_fits_into = osc_lock_fits_into, +}; + +static int osc_lock_lockless_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + + LASSERT(ols->ols_state == OLS_GRANTED); + LINVRNT(osc_lock_invariant(ols)); + + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + return 0; +} + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + int result; + + result = osc_lock_flush(ols, 0); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + ols->ols_state = OLS_CANCELLED; +} + +static int osc_lock_lockless_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED); + + return lock->cll_error; +} + +static void osc_lock_lockless_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD) { + struct osc_io *oio = osc_env_io(env); + + LASSERT(ergo(lock->ols_owner, lock->ols_owner == oio)); + lock->ols_owner = oio; + + /* set the io to be lockless if this lock is for io's + * host object */ + if (cl_object_same(oio->oi_cl.cis_obj, slice->cls_obj)) + oio->oi_lockless = 1; + } +} + +static int osc_lock_lockless_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + if (!(need->cld_enq_flags & CEF_NEVER)) + return 0; + + /* lockless lock should only be used by its owning io. b22147 */ + return (lock->ols_owner == osc_env_io(env)); +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_lockless_wait, + .clo_unuse = osc_lock_lockless_unuse, + .clo_state = osc_lock_lockless_state, + .clo_fits_into = osc_lock_lockless_fits_into, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *unused) +{ + struct osc_lock *clk; + int result; + + OBD_SLAB_ALLOC_PTR_GFP(clk, osc_lock_kmem, __GFP_IO); + if (clk != NULL) { + __u32 enqflags = lock->cll_descr.cld_enq_flags; + + osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo); + atomic_set(&clk->ols_pageref, 0); + clk->ols_state = OLS_NEW; + + clk->ols_flags = osc_enq2ldlm_flags(enqflags); + clk->ols_agl = !!(enqflags & CEF_AGL); + if (clk->ols_agl) + clk->ols_flags |= LDLM_FL_BLOCK_NOWAIT; + if (clk->ols_flags & LDLM_FL_HAS_INTENT) + clk->ols_glimpse = 1; + + cl_lock_slice_add(lock, &clk->ols_cl, obj, &osc_lock_ops); + + if (!(enqflags & CEF_MUST)) + /* try to convert this lock to a lockless lock */ + osc_lock_to_lockless(env, clk, (enqflags & CEF_NEVER)); + if (clk->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) + clk->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + + LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx\n", + lock, clk, clk->ols_flags); + + result = 0; + } else + result = -ENOMEM; + return result; +} + +int osc_dlm_lock_pageref(struct ldlm_lock *dlm) +{ + struct osc_lock *olock; + int rc = 0; + + spin_lock(&osc_ast_guard); + olock = dlm->l_ast_data; + /* + * there's a very rare race with osc_page_addref_lock(), but that + * doesn't matter because in the worst case we don't cancel a lock + * which we actually can, that's no harm. + */ + if (olock != NULL && + atomic_add_return(_PAGEREF_MAGIC, + &olock->ols_pageref) != _PAGEREF_MAGIC) { + atomic_sub(_PAGEREF_MAGIC, &olock->ols_pageref); + rc = 1; + } + spin_unlock(&osc_ast_guard); + return rc; +} + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c new file mode 100644 index 000000000000..ca94e6331381 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_object.c @@ -0,0 +1,275 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/** \addtogroup osc + * @{ + */ + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + int i; + + osc->oo_oinfo = cconf->u.coc_oinfo; + spin_lock_init(&osc->oo_seatbelt); + for (i = 0; i < CRT_NR; ++i) + INIT_LIST_HEAD(&osc->oo_inflight[i]); + + INIT_LIST_HEAD(&osc->oo_ready_item); + INIT_LIST_HEAD(&osc->oo_hp_ready_item); + INIT_LIST_HEAD(&osc->oo_write_item); + INIT_LIST_HEAD(&osc->oo_read_item); + + osc->oo_root.rb_node = NULL; + INIT_LIST_HEAD(&osc->oo_hp_exts); + INIT_LIST_HEAD(&osc->oo_urgent_exts); + INIT_LIST_HEAD(&osc->oo_rpc_exts); + INIT_LIST_HEAD(&osc->oo_reading_exts); + atomic_set(&osc->oo_nr_reads, 0); + atomic_set(&osc->oo_nr_writes, 0); + spin_lock_init(&osc->oo_lock); + + cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); + + return 0; +} + +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + int i; + + for (i = 0; i < CRT_NR; ++i) + LASSERT(list_empty(&osc->oo_inflight[i])); + + LASSERT(list_empty(&osc->oo_ready_item)); + LASSERT(list_empty(&osc->oo_hp_ready_item)); + LASSERT(list_empty(&osc->oo_write_item)); + LASSERT(list_empty(&osc->oo_read_item)); + + LASSERT(osc->oo_root.rb_node == NULL); + LASSERT(list_empty(&osc->oo_hp_exts)); + LASSERT(list_empty(&osc->oo_urgent_exts)); + LASSERT(list_empty(&osc->oo_rpc_exts)); + LASSERT(list_empty(&osc->oo_reading_exts)); + LASSERT(atomic_read(&osc->oo_nr_reads) == 0); + LASSERT(atomic_read(&osc->oo_nr_writes) == 0); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" " + "ctime: "LPU64" blocks: "LPU64, + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} + +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "DOSTID" " + "idx: %d gen: %d kms_valid: %u kms "LPU64" " + "rc: %d force_sync: %d min_xid: "LPU64" ", + POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} + + +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; + return 0; +} + +int osc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) { + CDEBUG(D_CACHE, "set kms from "LPU64"to "LPU64"\n", + oinfo->loi_kms, (__u64)attr->cat_kms); + loi_kms_set(oinfo, attr->cat_kms); + } + return 0; +} + +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + ENTRY; + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + RETURN(0); +} + + +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + cfs_time_t cur_time = cfs_time_current(); + cfs_time_t retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. + */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_set = osc_attr_set, + .coo_glimpse = osc_object_glimpse +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *unused, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR_GFP(osc, osc_object_kmem, __GFP_IO); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c new file mode 100644 index 000000000000..07d3702ac574 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_page.c @@ -0,0 +1,926 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del); +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg); +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg); + +/** \addtogroup osc + * @{ + */ + +/* + * Comment out osc_page_protected because it may sleep inside the + * the client_obd_list_lock. + * client_obd_list_lock -> osc_ap_completion -> osc_completion -> + * -> osc_page_protected -> osc_page_is_dlocked -> osc_match_base + * -> ldlm_lock_match -> sptlrpc_import_check_ctx -> sleep. + */ +#if 0 +static int osc_page_is_dlocked(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int pending, int unref) +{ + struct cl_page *page; + struct osc_object *obj; + struct osc_thread_info *info; + struct ldlm_res_id *resname; + struct lustre_handle *lockh; + ldlm_policy_data_t *policy; + ldlm_mode_t dlmmode; + int flags; + + might_sleep(); + + info = osc_env_info(env); + resname = &info->oti_resname; + policy = &info->oti_policy; + lockh = &info->oti_handle; + page = opg->ops_cl.cpl_page; + obj = cl2osc(opg->ops_cl.cpl_obj); + + flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED; + if (pending) + flags |= LDLM_FL_CBPENDING; + + dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW; + osc_lock_build_res(env, obj, resname); + osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index); + return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + dlmmode, &flags, NULL, lockh, unref); +} + +/** + * Checks an invariant that a page in the cache is covered by a lock, as + * needed. + */ +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + struct cl_object_header *hdr; + struct cl_lock *scan; + struct cl_page *page; + struct cl_lock_descr *descr; + int result; + + LINVRNT(!opg->ops_temp); + + page = opg->ops_cl.cpl_page; + if (page->cp_owner != NULL && + cl_io_top(page->cp_owner)->ci_lockreq == CILR_NEVER) + /* + * If IO is done without locks (liblustre, or lloop), lock is + * not required. + */ + result = 1; + else + /* otherwise check for a DLM lock */ + result = osc_page_is_dlocked(env, opg, mode, 1, unref); + if (result == 0) { + /* maybe this page is a part of a lockless io? */ + hdr = cl_object_header(opg->ops_cl.cpl_obj); + descr = &osc_env_info(env)->oti_descr; + descr->cld_mode = mode; + descr->cld_start = page->cp_index; + descr->cld_end = page->cp_index; + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + /* + * Lock-less sub-lock has to be either in HELD state + * (when io is actively going on), or in CACHED state, + * when top-lock is being unlocked: + * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse(). + */ + if ((scan->cll_state == CLS_HELD || + scan->cll_state == CLS_CACHED) && + cl_lock_ext_match(&scan->cll_descr, descr)) { + struct osc_lock *olck; + + olck = osc_lock_at(scan); + result = osc_lock_is_lockless(olck); + break; + } + } + spin_unlock(&hdr->coh_lock_guard); + } + return result; +} +#else +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + return 1; +} +#endif + +/***************************************************************************** + * + * Page operations. + * + */ +static void osc_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + CDEBUG(D_TRACE, "%p\n", opg); + LASSERT(opg->ops_lock == NULL); +} + +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + if (opg->ops_transfer_pinned) { + lu_ref_del(&page->cp_reference, "transfer", page); + opg->ops_transfer_pinned = 0; + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). + */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + /* ops_lru and ops_inflight share the same field, so take it from LRU + * first and then use it as inflight. */ + osc_lru_del(osc_cli(obj), opg, false); + + spin_lock(&obj->oo_seatbelt); + list_add(&opg->ops_inflight, &obj->oo_inflight[crt]); + opg->ops_submitter = current; + spin_unlock(&obj->oo_seatbelt); +} + +static int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + struct osc_page *opg = cl2osc_page(slice); + int result; + ENTRY; + + LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0)); + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, io, opg); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + + /* for sync write, kernel will wait for this page to be flushed before + * osc_io_end() is called, so release it earlier. + * for mkwrite(), it's known there is no further pages. */ + if (cl_io_is_sync_write(io) || cl_io_is_mkwrite(io)) { + if (oio->oi_active != NULL) { + osc_extent_release(env, oio->oi_active); + oio->oi_active = NULL; + } + } + + RETURN(result); +} + +void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static int osc_page_addref_lock(const struct lu_env *env, + struct osc_page *opg, + struct cl_lock *lock) +{ + struct osc_lock *olock; + int rc; + + LASSERT(opg->ops_lock == NULL); + + olock = osc_lock_at(lock); + if (atomic_inc_return(&olock->ols_pageref) <= 0) { + atomic_dec(&olock->ols_pageref); + rc = -ENODATA; + } else { + cl_lock_get(lock); + opg->ops_lock = lock; + rc = 0; + } + return rc; +} + +static void osc_page_putref_lock(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_lock *lock = opg->ops_lock; + struct osc_lock *olock; + + LASSERT(lock != NULL); + olock = osc_lock_at(lock); + + atomic_dec(&olock->ols_pageref); + opg->ops_lock = NULL; + + cl_lock_put(env, lock); +} + +static int osc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + struct cl_lock *lock; + int result = -ENODATA; + + ENTRY; + lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page, + NULL, 1, 0); + if (lock != NULL) { + if (osc_page_addref_lock(env, cl2osc_page(slice), lock) == 0) + result = -EBUSY; + cl_lock_put(env, lock); + } + RETURN(result); +} + +static void osc_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + + if (unlikely(opg->ops_lock)) + osc_page_putref_lock(env, opg); +} + +static void osc_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + + if (likely(opg->ops_lock)) + osc_page_putref_lock(env, opg); + osc_lru_add(osc_cli(obj), opg); +} + +static void osc_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(slice->cpl_obj); + + osc_lru_add(osc_cli(obj), opg); +} + +static int osc_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *unused) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? "-" : "+"; +} + +static inline cfs_time_t osc_submit_duration(struct osc_page *opg) +{ + if (opg->ops_submit_time == 0) + return 0; + + return (cfs_time_current() - opg->ops_submit_time); +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = cl2osc(slice->cpl_obj); + struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: " + "1< %#x %d %u %s %s > " + "2< "LPU64" %u %u %#x %#x | %p %p %p > " + "3< %s %p %d %lu %d > " + "4< %d %d %d %lu %s | %s %s %s %s > " + "5< %s %s %s %s | %d %s | %d %s %s>\n", + opg, + /* 1 */ + oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_rpc_item), + /* 2 */ + oap->oap_obj_off, oap->oap_page_off, oap->oap_count, + oap->oap_async_flags, oap->oap_brw_flags, + oap->oap_request, oap->oap_cli, obj, + /* 3 */ + osc_list(&opg->ops_inflight), + opg->ops_submitter, opg->ops_transfer_pinned, + osc_submit_duration(opg), opg->ops_srvlock, + /* 4 */ + cli->cl_r_in_flight, cli->cl_w_in_flight, + cli->cl_max_rpcs_in_flight, + cli->cl_avail_grant, + osc_list(&cli->cl_cache_waiters), + osc_list(&cli->cl_loi_ready_list), + osc_list(&cli->cl_loi_hp_ready_list), + osc_list(&cli->cl_loi_write_list), + osc_list(&cli->cl_loi_read_list), + /* 5 */ + osc_list(&obj->oo_ready_item), + osc_list(&obj->oo_hp_ready_item), + osc_list(&obj->oo_write_item), + osc_list(&obj->oo_read_item), + atomic_read(&obj->oo_nr_reads), + osc_list(&obj->oo_reading_exts), + atomic_read(&obj->oo_nr_writes), + osc_list(&obj->oo_hp_exts), + osc_list(&obj->oo_urgent_exts)); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + int rc; + + LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1)); + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(env, obj, opg); + if (rc) { + CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(slice->cpl_page), + "Trying to teardown failed: %d\n", rc); + LASSERT(0); + } + + spin_lock(&obj->oo_seatbelt); + if (opg->ops_submitter != NULL) { + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + opg->ops_submitter = NULL; + } + spin_unlock(&obj->oo_seatbelt); + + osc_lru_del(osc_cli(obj), opg, true); + EXIT; +} + +void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + opg->ops_from = from; + opg->ops_to = to; + spin_lock(&oap->oap_lock); + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + spin_unlock(&oap->oap_lock); +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. */ + rc = osc_cancel_async_page(env, opg); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + return rc; +} + +static int osc_page_flush(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct osc_page *opg = cl2osc_page(slice); + int rc = 0; + ENTRY; + rc = osc_flush_async_page(env, io, opg); + RETURN(rc); +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_fini = osc_page_fini, + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_is_under_lock = osc_page_is_under_lock, + .cpo_disown = osc_page_disown, + .io = { + [CRT_READ] = { + .cpo_cache_add = osc_page_fail, + .cpo_completion = osc_page_completion_read + }, + [CRT_WRITE] = { + .cpo_cache_add = osc_page_cache_add, + .cpo_completion = osc_page_completion_write + } + }, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel, + .cpo_flush = osc_page_flush +}; + +int osc_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg = cl_object_page_slice(obj, page); + int result; + + opg->ops_from = 0; + opg->ops_to = PAGE_CACHE_SIZE; + + result = osc_prep_async_page(osc, opg, vmpage, + cl_offset(obj, page->cp_index)); + if (result == 0) { + struct osc_io *oio = osc_env_io(env); + opg->ops_srvlock = osc_io_srvlock(oio); + cl_page_slice_add(page, &opg->ops_cl, obj, + &osc_page_ops); + } + /* + * Cannot assert osc_page_protected() here as read-ahead + * creates temporary pages outside of a lock. + */ + /* ops_inflight and ops_lru are the same field, but it doesn't + * hurt to initialize it twice :-) */ + INIT_LIST_HEAD(&opg->ops_inflight); + INIT_LIST_HEAD(&opg->ops_lru); + + /* reserve an LRU space for this page */ + if (page->cp_type == CPT_CACHEABLE && result == 0) + result = osc_lru_reserve(env, osc, opg); + + return result; +} + +/** + * Helper function called by osc_io_submit() for every page in an immediate + * transfer (i.e., transferred synchronously). + */ +void osc_page_submit(const struct lu_env *env, struct osc_page *opg, + enum cl_req_type crt, int brw_flags) +{ + struct osc_async_page *oap = &opg->ops_oap; + struct osc_object *obj = oap->oap_obj; + + LINVRNT(osc_page_protected(env, opg, + crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1)); + + LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, " + "magic 0x%x\n", oap, oap->oap_magic); + LASSERT(oap->oap_async_flags & ASYNC_READY); + LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; + + if (!client_is_remote(osc_export(obj)) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + opg->ops_submit_time = cfs_time_current(); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/* --------------- LRU page management ------------------ */ + +/* OSC is a natural place to manage LRU pages as applications are specialized + * to write OSC by OSC. Ideally, if one OSC is used more frequently it should + * occupy more LRU slots. On the other hand, we should avoid using up all LRU + * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep + * for free LRU slots - this will be very bad so the algorithm requires each + * OSC to free slots voluntarily to maintain a reasonable number of free slots + * at any time. + */ + +static CFS_DECL_WAITQ(osc_lru_waitq); +static atomic_t osc_lru_waiters = ATOMIC_INIT(0); +/* LRU pages are freed in batch mode. OSC should at least free this + * number of pages to avoid running out of LRU budget, and.. */ +static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT); /* 2M */ +/* free this number at most otherwise it will take too long time to finsih. */ +static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */ + +/* Check if we can free LRU slots from this OSC. If there exists LRU waiters, + * we should free slots aggressively. In this way, slots are freed in a steady + * step to maintain fairness among OSCs. + * + * Return how many LRU pages should be freed. */ +static int osc_cache_too_much(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int pages = atomic_read(&cli->cl_lru_in_list) >> 1; + + if (atomic_read(&osc_lru_waiters) > 0 && + atomic_read(cli->cl_lru_left) < lru_shrink_max) + /* drop lru pages aggressively */ + return min(pages, lru_shrink_max); + + /* if it's going to run out LRU slots, we should free some, but not + * too much to maintain faireness among OSCs. */ + if (atomic_read(cli->cl_lru_left) < cache->ccc_lru_max >> 4) { + unsigned long tmp; + + tmp = cache->ccc_lru_max / atomic_read(&cache->ccc_users); + if (pages > tmp) + return min(pages, lru_shrink_max); + + return pages > lru_shrink_min ? lru_shrink_min : 0; + } + + return 0; +} + +/* Return how many pages are not discarded in @pvec. */ +static int discard_pagevec(const struct lu_env *env, struct cl_io *io, + struct cl_page **pvec, int max_index) +{ + int count; + int i; + + for (count = 0, i = 0; i < max_index; i++) { + struct cl_page *page = pvec[i]; + if (cl_page_own_try(env, io, page) == 0) { + /* free LRU page only if nobody is using it. + * This check is necessary to avoid freeing the pages + * having already been removed from LRU and pinned + * for IO. */ + if (!cl_page_in_use(page)) { + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + ++count; + } + cl_page_disown(env, io, page); + } + cl_page_put(env, page); + pvec[i] = NULL; + } + return max_index - count; +} + +/** + * Drop @target of pages from LRU at most. + */ +int osc_lru_shrink(struct client_obd *cli, int target) +{ + struct cl_env_nest nest; + struct lu_env *env; + struct cl_io *io; + struct cl_object *clobj = NULL; + struct cl_page **pvec; + struct osc_page *opg; + int maxscan = 0; + int count = 0; + int index = 0; + int rc = 0; + ENTRY; + + LASSERT(atomic_read(&cli->cl_lru_in_list) >= 0); + if (atomic_read(&cli->cl_lru_in_list) == 0 || target <= 0) + RETURN(0); + + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + pvec = osc_env_info(env)->oti_pvec; + io = &osc_env_info(env)->oti_io; + + client_obd_list_lock(&cli->cl_lru_list_lock); + atomic_inc(&cli->cl_lru_shrinkers); + maxscan = min(target << 1, atomic_read(&cli->cl_lru_in_list)); + while (!list_empty(&cli->cl_lru_list)) { + struct cl_page *page; + + if (--maxscan < 0) + break; + + opg = list_entry(cli->cl_lru_list.next, struct osc_page, + ops_lru); + page = cl_page_top(opg->ops_cl.cpl_page); + if (cl_page_in_use_noref(page)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + continue; + } + + LASSERT(page->cp_obj != NULL); + if (clobj != page->cp_obj) { + struct cl_object *tmp = page->cp_obj; + + cl_object_get(tmp); + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + cl_io_fini(env, io); + cl_object_put(env, clobj); + clobj = NULL; + } + + clobj = tmp; + io->ci_obj = clobj; + io->ci_ignore_layout = 1; + rc = cl_io_init(env, io, CIT_MISC, clobj); + + client_obd_list_lock(&cli->cl_lru_list_lock); + + if (rc != 0) + break; + + ++maxscan; + continue; + } + + /* move this page to the end of list as it will be discarded + * soon. The page will be finally removed from LRU list in + * osc_page_delete(). */ + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + + /* it's okay to grab a refcount here w/o holding lock because + * it has to grab cl_lru_list_lock to delete the page. */ + cl_page_get(page); + pvec[index++] = page; + if (++count >= target) + break; + + if (unlikely(index == OTI_PVEC_SIZE)) { + client_obd_list_unlock(&cli->cl_lru_list_lock); + count -= discard_pagevec(env, io, pvec, index); + index = 0; + + client_obd_list_lock(&cli->cl_lru_list_lock); + } + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (clobj != NULL) { + count -= discard_pagevec(env, io, pvec, index); + + cl_io_fini(env, io); + cl_object_put(env, clobj); + } + cl_env_nested_put(&nest, env); + + atomic_dec(&cli->cl_lru_shrinkers); + RETURN(count > 0 ? count : rc); +} + +static void osc_lru_add(struct client_obd *cli, struct osc_page *opg) +{ + bool wakeup = false; + + if (!opg->ops_in_lru) + return; + + atomic_dec(&cli->cl_lru_busy); + client_obd_list_lock(&cli->cl_lru_list_lock); + if (list_empty(&opg->ops_lru)) { + list_move_tail(&opg->ops_lru, &cli->cl_lru_list); + atomic_inc_return(&cli->cl_lru_in_list); + wakeup = atomic_read(&osc_lru_waiters) > 0; + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + + if (wakeup) { + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up_all(&osc_lru_waitq); + } +} + +/* delete page from LRUlist. The page can be deleted from LRUlist for two + * reasons: redirtied or deleted from page cache. */ +static void osc_lru_del(struct client_obd *cli, struct osc_page *opg, bool del) +{ + if (opg->ops_in_lru) { + client_obd_list_lock(&cli->cl_lru_list_lock); + if (!list_empty(&opg->ops_lru)) { + LASSERT(atomic_read(&cli->cl_lru_in_list) > 0); + list_del_init(&opg->ops_lru); + atomic_dec(&cli->cl_lru_in_list); + if (!del) + atomic_inc(&cli->cl_lru_busy); + } else if (del) { + LASSERT(atomic_read(&cli->cl_lru_busy) > 0); + atomic_dec(&cli->cl_lru_busy); + } + client_obd_list_unlock(&cli->cl_lru_list_lock); + if (del) { + atomic_inc(cli->cl_lru_left); + /* this is a great place to release more LRU pages if + * this osc occupies too many LRU pages and kernel is + * stealing one of them. + * cl_lru_shrinkers is to avoid recursive call in case + * we're already in the context of osc_lru_shrink(). */ + if (atomic_read(&cli->cl_lru_shrinkers) == 0) + osc_lru_shrink(cli, osc_cache_too_much(cli)); + wake_up(&osc_lru_waitq); + } + } else { + LASSERT(list_empty(&opg->ops_lru)); + } +} + +static inline int max_to_shrink(struct client_obd *cli) +{ + return min(atomic_read(&cli->cl_lru_in_list) >> 1, lru_shrink_max); +} + +static int osc_lru_reclaim(struct client_obd *cli) +{ + struct cl_client_cache *cache = cli->cl_cache; + int max_scans; + int rc; + + LASSERT(cache != NULL); + LASSERT(!list_empty(&cache->ccc_lru)); + + rc = osc_lru_shrink(cli, lru_shrink_min); + if (rc != 0) { + CDEBUG(D_CACHE, "%s: Free %d pages from own LRU: %p.\n", + cli->cl_import->imp_obd->obd_name, rc, cli); + return rc; + } + + CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + /* Reclaim LRU slots from other client_obd as it can't free enough + * from its own. This should rarely happen. */ + spin_lock(&cache->ccc_lru_lock); + cache->ccc_lru_shrinkers++; + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + + max_scans = atomic_read(&cache->ccc_users); + while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { + cli = list_entry(cache->ccc_lru.next, struct client_obd, + cl_lru_osc); + + CDEBUG(D_CACHE, "%s: cli %p LRU pages: %d, busy: %d.\n", + cli->cl_import->imp_obd->obd_name, cli, + atomic_read(&cli->cl_lru_in_list), + atomic_read(&cli->cl_lru_busy)); + + list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); + if (atomic_read(&cli->cl_lru_in_list) > 0) { + spin_unlock(&cache->ccc_lru_lock); + + rc = osc_lru_shrink(cli, max_to_shrink(cli)); + spin_lock(&cache->ccc_lru_lock); + if (rc != 0) + break; + } + } + spin_unlock(&cache->ccc_lru_lock); + + CDEBUG(D_CACHE, "%s: cli %p freed %d pages.\n", + cli->cl_import->imp_obd->obd_name, cli, rc); + return rc; +} + +static int osc_lru_reserve(const struct lu_env *env, struct osc_object *obj, + struct osc_page *opg) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct client_obd *cli = osc_cli(obj); + int rc = 0; + ENTRY; + + if (cli->cl_cache == NULL) /* shall not be in LRU */ + RETURN(0); + + LASSERT(atomic_read(cli->cl_lru_left) >= 0); + while (!cfs_atomic_add_unless(cli->cl_lru_left, -1, 0)) { + int gen; + + /* run out of LRU spaces, try to drop some by itself */ + rc = osc_lru_reclaim(cli); + if (rc < 0) + break; + if (rc > 0) + continue; + + cond_resched(); + + /* slowest case, all of caching pages are busy, notifying + * other OSCs that we're lack of LRU slots. */ + atomic_inc(&osc_lru_waiters); + + gen = atomic_read(&cli->cl_lru_in_list); + rc = l_wait_event(osc_lru_waitq, + atomic_read(cli->cl_lru_left) > 0 || + (atomic_read(&cli->cl_lru_in_list) > 0 && + gen != atomic_read(&cli->cl_lru_in_list)), + &lwi); + + atomic_dec(&osc_lru_waiters); + if (rc < 0) + break; + } + + if (rc >= 0) { + atomic_inc(&cli->cl_lru_busy); + opg->ops_in_lru = 1; + rc = 0; + } + + RETURN(rc); +} + +/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c new file mode 100644 index 000000000000..69caab76ced3 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_quota.c @@ -0,0 +1,332 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Code originally extracted from quota directory + */ + +#include +#include "osc_internal.h" + +static inline struct osc_quota_info *osc_oqi_alloc(obd_uid id) +{ + struct osc_quota_info *oqi; + + OBD_SLAB_ALLOC_PTR(oqi, osc_quota_kmem); + if (oqi != NULL) + oqi->oqi_id = id; + + return oqi; +} + +int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) +{ + int type; + ENTRY; + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if (oqi) { + obd_uid id = oqi->oqi_id; + + LASSERTF(id == qid[type], + "The ids don't match %u != %u\n", + id, qid[type]); + + /* the slot is busy, the user is about to run out of + * quota space on this OST */ + CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", + type == USRQUOTA ? "user" : "grout", qid[type]); + RETURN(NO_QUOTA); + } + } + + RETURN(QUOTA_OK); +} + +#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \ + : OBD_MD_FLGRPQUOTA) +#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_FL_NO_USRQUOTA \ + : OBD_FL_NO_GRPQUOTA) + +int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], + obd_flag valid, obd_flag flags) +{ + int type; + int rc = 0; + ENTRY; + + if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0) + RETURN(0); + + for (type = 0; type < MAXQUOTAS; type++) { + struct osc_quota_info *oqi; + + if ((valid & MD_QUOTA_FLAG(type)) == 0) + continue; + + /* lookup the ID in the per-type hash table */ + oqi = cfs_hash_lookup(cli->cl_quota_hash[type], &qid[type]); + if ((flags & FL_QUOTA_FLAG(type)) != 0) { + /* This ID is getting close to its quota limit, let's + * switch to sync I/O */ + if (oqi != NULL) + continue; + + oqi = osc_oqi_alloc(qid[type]); + if (oqi == NULL) { + rc = -ENOMEM; + break; + } + + rc = cfs_hash_add_unique(cli->cl_quota_hash[type], + &qid[type], &oqi->oqi_hash); + /* race with others? */ + if (rc == -EALREADY) { + rc = 0; + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + } + + CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], rc); + } else { + /* This ID is now off the hook, let's remove it from + * the hash table */ + if (oqi == NULL) + continue; + + oqi = cfs_hash_del_key(cli->cl_quota_hash[type], + &qid[type]); + if (oqi) + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); + + CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", + cli->cl_import->imp_obd->obd_name, + type == USRQUOTA ? "user" : "group", + qid[type], oqi); + } + } + + RETURN(rc); +} + +/* + * Hash operations for uid/gid <-> osc_quota_info + */ +static unsigned +oqi_hashfn(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_u32_hash(*((__u32*)key), mask); +} + +static int +oqi_keycmp(const void *key, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + obd_uid uid; + + LASSERT(key != NULL); + uid = *((obd_uid*)key); + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + return uid == oqi->oqi_id; +} + +static void * +oqi_key(struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + return &oqi->oqi_id; +} + +static void * +oqi_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct osc_quota_info, oqi_hash); +} + +static void +oqi_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ +} + +static void +oqi_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct osc_quota_info *oqi; + + oqi = hlist_entry(hnode, struct osc_quota_info, oqi_hash); + + OBD_SLAB_FREE_PTR(oqi, osc_quota_kmem); +} + +#define HASH_QUOTA_BKT_BITS 5 +#define HASH_QUOTA_CUR_BITS 5 +#define HASH_QUOTA_MAX_BITS 15 + +static cfs_hash_ops_t quota_hash_ops = { + .hs_hash = oqi_hashfn, + .hs_keycmp = oqi_keycmp, + .hs_key = oqi_key, + .hs_object = oqi_object, + .hs_get = oqi_get, + .hs_put_locked = oqi_put_locked, + .hs_exit = oqi_exit, +}; + +int osc_quota_setup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int i, type; + ENTRY; + + for (type = 0; type < MAXQUOTAS; type++) { + cli->cl_quota_hash[type] = cfs_hash_create("QUOTA_HASH", + HASH_QUOTA_CUR_BITS, + HASH_QUOTA_MAX_BITS, + HASH_QUOTA_BKT_BITS, + 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + "a_hash_ops, + CFS_HASH_DEFAULT); + if (cli->cl_quota_hash[type] == NULL) + break; + } + + if (type == MAXQUOTAS) + RETURN(0); + + for (i = 0; i < type; i++) + cfs_hash_putref(cli->cl_quota_hash[i]); + + RETURN(-ENOMEM); +} + +int osc_quota_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int type; + ENTRY; + + for (type = 0; type < MAXQUOTAS; type++) + cfs_hash_putref(cli->cl_quota_hash[type]); + + RETURN(0); +} + +int osc_quotactl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, + OST_QUOTACTL); + if (req == NULL) + RETURN(-ENOMEM); + + oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *oqc = *oqctl; + + ptlrpc_request_set_replen(req); + ptlrpc_at_set_req_timeout(req); + req->rq_no_resend = 1; + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + + if (req->rq_repmsg && + (oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL))) { + *oqctl = *oqc; + } else if (!rc) { + CERROR ("Can't unpack obd_quotactl\n"); + rc = -EPROTO; + } + ptlrpc_req_finished(req); + + RETURN(rc); +} + +int osc_quotacheck(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTACHECK, LUSTRE_OST_VERSION, + OST_QUOTACHECK); + if (req == NULL) + RETURN(-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); + *body = *oqctl; + + ptlrpc_request_set_replen(req); + + /* the next poll will find -ENODATA, that means quotacheck is + * going on */ + cli->cl_qchk_stat = -ENODATA; + rc = ptlrpc_queue_wait(req); + if (rc) + cli->cl_qchk_stat = rc; + ptlrpc_req_finished(req); + RETURN(rc); +} + +int osc_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + int rc; + ENTRY; + + qchk->obd_uuid = cli->cl_target_uuid; + memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME)); + + rc = cli->cl_qchk_stat; + /* the client is not the previous one */ + if (rc == CL_NOT_QUOTACHECKED) + rc = -EINTR; + RETURN(rc); +} diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c new file mode 100644 index 000000000000..3062e47de8f9 --- /dev/null +++ b/drivers/staging/lustre/lustre/osc/osc_request.c @@ -0,0 +1,3668 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include + + +#include +#include +#include +#include +#include +#include + +#ifdef __CYGWIN__ +# include +#endif + +#include +#include +#include +#include +#include +#include +#include "osc_internal.h" +#include "osc_cl_internal.h" + +static void osc_release_ppga(struct brw_page **ppga, obd_count count); +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc); +int osc_cleanup(struct obd_device *obd); + +/* Pack OSC object metadata for disk storage (LE byte order). */ +static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, + struct lov_stripe_md *lsm) +{ + int lmm_size; + ENTRY; + + lmm_size = sizeof(**lmmp); + if (lmmp == NULL) + RETURN(lmm_size); + + if (*lmmp != NULL && lsm == NULL) { + OBD_FREE(*lmmp, lmm_size); + *lmmp = NULL; + RETURN(0); + } else if (unlikely(lsm != NULL && ostid_id(&lsm->lsm_oi) == 0)) { + RETURN(-EBADF); + } + + if (*lmmp == NULL) { + OBD_ALLOC(*lmmp, lmm_size); + if (*lmmp == NULL) + RETURN(-ENOMEM); + } + + if (lsm) + ostid_cpu_to_le(&lsm->lsm_oi, &(*lmmp)->lmm_oi); + + RETURN(lmm_size); +} + +/* Unpack OSC object metadata from disk storage (LE byte order). */ +static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmm, int lmm_bytes) +{ + int lsm_size; + struct obd_import *imp = class_exp2cliimp(exp); + ENTRY; + + if (lmm != NULL) { + if (lmm_bytes < sizeof(*lmm)) { + CERROR("%s: lov_mds_md too small: %d, need %d\n", + exp->exp_obd->obd_name, lmm_bytes, + (int)sizeof(*lmm)); + RETURN(-EINVAL); + } + /* XXX LOV_MAGIC etc check? */ + + if (unlikely(ostid_id(&lmm->lmm_oi) == 0)) { + CERROR("%s: zero lmm_object_id: rc = %d\n", + exp->exp_obd->obd_name, -EINVAL); + RETURN(-EINVAL); + } + } + + lsm_size = lov_stripe_md_size(1); + if (lsmp == NULL) + RETURN(lsm_size); + + if (*lsmp != NULL && lmm == NULL) { + OBD_FREE((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + OBD_FREE(*lsmp, lsm_size); + *lsmp = NULL; + RETURN(0); + } + + if (*lsmp == NULL) { + OBD_ALLOC(*lsmp, lsm_size); + if (unlikely(*lsmp == NULL)) + RETURN(-ENOMEM); + OBD_ALLOC((*lsmp)->lsm_oinfo[0], sizeof(struct lov_oinfo)); + if (unlikely((*lsmp)->lsm_oinfo[0] == NULL)) { + OBD_FREE(*lsmp, lsm_size); + RETURN(-ENOMEM); + } + loi_init((*lsmp)->lsm_oinfo[0]); + } else if (unlikely(ostid_id(&(*lsmp)->lsm_oi) == 0)) { + RETURN(-EBADF); + } + + if (lmm != NULL) + /* XXX zero *lsmp? */ + ostid_le_to_cpu(&lmm->lmm_oi, &(*lsmp)->lsm_oi); + + if (imp != NULL && + (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES)) + (*lsmp)->lsm_maxbytes = imp->imp_connect_data.ocd_maxbytes; + else + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; + + RETURN(lsm_size); +} + +static inline void osc_pack_capa(struct ptlrpc_request *req, + struct ost_body *body, void *capa) +{ + struct obd_capa *oc = (struct obd_capa *)capa; + struct lustre_capa *c; + + if (!capa) + return; + + c = req_capsule_client_get(&req->rq_pill, &RMF_CAPA1); + LASSERT(c); + capa_cpy(c, oc); + body->oa.o_valid |= OBD_MD_FLOSSCAPA; + DEBUG_CAPA(D_SEC, c, "pack"); +} + +static inline void osc_pack_req_body(struct ptlrpc_request *req, + struct obd_info *oinfo) +{ + struct ost_body *body; + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + + lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); +} + +static inline void osc_set_capa_size(struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa *oc) +{ + if (oc == NULL) + req_capsule_set_size(&req->rq_pill, field, RCL_CLIENT, 0); + else + /* it is already calculated as sizeof struct obd_capa */ + ; +} + +static int osc_getattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body) { + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(aa->aa_oi->oi_oa, &body->oa); + + /* This should really be sent by the OST */ + aa->aa_oi->oi_oa->o_blksize = DT_MAX_BRW_SIZE; + aa->aa_oi->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + } else { + CDEBUG(D_INFO, "can't unpack ost_body\n"); + rc = -EPROTO; + aa->aa_oi->oi_oa->o_valid = 0; + } +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_getattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_getattr_interpret; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(set, req); + RETURN(0); +} + +static int osc_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + + oinfo->oi_oa->o_blksize = cli_brw_size(exp->exp_obd); + oinfo->oi_oa->o_valid |= OBD_MD_FLBLKSZ; + + EXIT; + out: + ptlrpc_req_finished(req); + return rc; +} + +static int osc_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + int rc; + ENTRY; + + LASSERT(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(oinfo->oi_oa, &body->oa); + + EXIT; +out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int osc_setattr_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_setattr_args *sa, int rc) +{ + struct ost_body *body; + ENTRY; + + if (rc != 0) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out, rc = -EPROTO); + + lustre_get_wire_obdo(sa->sa_oa, &body->oa); +out: + rc = sa->sa_upcall(sa->sa_cookie, rc); + RETURN(rc); +} + +int osc_setattr_async_base(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + if (oti && oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) + oinfo->oi_oa->o_lcookie = *oti->oti_logcookies; + + osc_pack_req_body(req, oinfo); + + ptlrpc_request_set_replen(req); + + /* do mds to ost setattr asynchronously */ + if (!rqset) { + /* Do not wait for response. */ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + } else { + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_setattr_interpret; + + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } + + RETURN(0); +} + +static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + return osc_setattr_async_base(exp, oinfo, oti, + oinfo->oi_cb_up, oinfo, rqset); +} + +int osc_real_create(struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md **ea, struct obd_trans_info *oti) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct lov_stripe_md *lsm; + int rc; + ENTRY; + + LASSERT(oa); + LASSERT(ea); + + lsm = *ea; + if (!lsm) { + rc = obd_alloc_memmd(exp, &lsm); + if (rc < 0) + RETURN(rc); + } + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); + if (rc) { + ptlrpc_request_free(req); + GOTO(out, rc); + } + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&body->oa, oa); + + ptlrpc_request_set_replen(req); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_DELORPHAN) { + DEBUG_REQ(D_HA, req, + "delorphan from OST integration"); + /* Don't resend the delorphan req */ + req->rq_no_resend = req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out_req, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) + GOTO(out_req, rc = -EPROTO); + + lustre_get_wire_obdo(oa, &body->oa); + + oa->o_blksize = cli_brw_size(exp->exp_obd); + oa->o_valid |= OBD_MD_FLBLKSZ; + + /* XXX LOV STACKING: the lsm that is passed to us from LOV does not + * have valid lsm_oinfo data structs, so don't go touching that. + * This needs to be fixed in a big way. + */ + lsm->lsm_oi = oa->o_oi; + *ea = lsm; + + if (oti != NULL) { + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + + if (oa->o_valid & OBD_MD_FLCOOKIE) { + if (!oti->oti_logcookies) + oti_alloc_cookies(oti, 1); + *oti->oti_logcookies = oa->o_lcookie; + } + } + + CDEBUG(D_HA, "transno: "LPD64"\n", + lustre_msg_get_transno(req->rq_repmsg)); +out_req: + ptlrpc_req_finished(req); +out: + if (rc && !*ea) + obd_free_memmd(exp, &lsm); + RETURN(rc); +} + +int osc_punch_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct osc_setattr_args *sa; + struct ost_body *body; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; + CLASSERT (sizeof(*sa) <= sizeof(req->rq_async_args)); + sa = ptlrpc_req_async_args(req); + sa->sa_oa = oinfo->oi_oa; + sa->sa_upcall = upcall; + sa->sa_cookie = cookie; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + RETURN(0); +} + +static int osc_punch(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start; + oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end; + oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + return osc_punch_base(exp, oinfo, + oinfo->oi_cb_up, oinfo, rqset); +} + +static int osc_sync_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc) +{ + struct osc_fsync_args *fa = arg; + struct ost_body *body; + ENTRY; + + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + GOTO(out, rc = -EPROTO); + } + + *fa->fa_oi->oi_oa = body->oa; +out: + rc = fa->fa_upcall(fa->fa_cookie, rc); + RETURN(rc); +} + +int osc_sync_base(struct obd_export *exp, struct obd_info *oinfo, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) +{ + struct ptlrpc_request *req; + struct ost_body *body; + struct osc_fsync_args *fa; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); + if (req == NULL) + RETURN(-ENOMEM); + + osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + /* overload the size and blocks fields in the oa with start/end */ + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&body->oa, oinfo->oi_oa); + osc_pack_capa(req, body, oinfo->oi_capa); + + ptlrpc_request_set_replen(req); + req->rq_interpret_reply = osc_sync_interpret; + + CLASSERT(sizeof(*fa) <= sizeof(req->rq_async_args)); + fa = ptlrpc_req_async_args(req); + fa->fa_oi = oinfo; + fa->fa_upcall = upcall; + fa->fa_cookie = cookie; + + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + + RETURN (0); +} + +static int osc_sync(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set) +{ + ENTRY; + + if (!oinfo->oi_oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + oinfo->oi_oa->o_size = start; + oinfo->oi_oa->o_blocks = end; + oinfo->oi_oa->o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + RETURN(osc_sync_base(exp, oinfo, oinfo->oi_cb_up, oinfo, set)); +} + +/* Find and cancel locally locks matched by @mode in the resource found by + * @objid. Found locks are added into @cancel list. Returns the amount of + * locks added to @cancels list. */ +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, + struct list_head *cancels, + ldlm_mode_t mode, int lock_flags) +{ + struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; + struct ldlm_res_id res_id; + struct ldlm_resource *res; + int count; + ENTRY; + + /* Return, i.e. cancel nothing, only if ELC is supported (flag in + * export) but disabled through procfs (flag in NS). + * + * This distinguishes from a case when ELC is not supported originally, + * when we still want to cancel locks in advance and just cancel them + * locally, without sending any RPC. */ + if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) + RETURN(0); + + ostid_build_res_name(&oa->o_oi, &res_id); + res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); + if (res == NULL) + RETURN(0); + + LDLM_RESOURCE_ADDREF(res); + count = ldlm_cancel_resource_local(res, cancels, NULL, mode, + lock_flags, 0, NULL); + LDLM_RESOURCE_DELREF(res); + ldlm_resource_putref(res); + RETURN(count); +} + +static int osc_destroy_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, + int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + + atomic_dec(&cli->cl_destroy_in_flight); + wake_up(&cli->cl_destroy_waitq); + return 0; +} + +static int osc_can_send_destroy(struct client_obd *cli) +{ + if (atomic_inc_return(&cli->cl_destroy_in_flight) <= + cli->cl_max_rpcs_in_flight) { + /* The destroy request can be sent */ + return 1; + } + if (atomic_dec_return(&cli->cl_destroy_in_flight) < + cli->cl_max_rpcs_in_flight) { + /* + * The counter has been modified between the two atomic + * operations. + */ + wake_up(&cli->cl_destroy_waitq); + } + return 0; +} + +int osc_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc = 0; + ENTRY; + + LASSERT(oa); + LASSERT(ea); + LASSERT(oa->o_valid & OBD_MD_FLGROUP); + + if ((oa->o_valid & OBD_MD_FLFLAGS) && + oa->o_flags == OBD_FL_RECREATE_OBJS) { + RETURN(osc_real_create(exp, oa, ea, oti)); + } + + if (!fid_seq_is_mdt(ostid_seq(&oa->o_oi))) + RETURN(osc_real_create(exp, oa, ea, oti)); + + /* we should not get here anymore */ + LBUG(); + + RETURN(rc); +} + +/* Destroy requests can be async always on the client, and we don't even really + * care about the return code since the client cannot do anything at all about + * a destroy failure. + * When the MDS is unlinking a filename, it saves the file objects into a + * recovery llog, and these object records are cancelled when the OST reports + * they were destroyed and sync'd to disk (i.e. transaction committed). + * If the client dies, or the OST is down when the object should be destroyed, + * the records are not cancelled, and when the OST reconnects to the MDS next, + * it will retrieve the llog unlink logs and then sends the log cancellation + * cookies to the MDS after committing destroy transactions. */ +static int osc_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_export, + void *capa) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct ost_body *body; + LIST_HEAD(cancels); + int rc, count; + ENTRY; + + if (!oa) { + CDEBUG(D_INFO, "oa NULL\n"); + RETURN(-EINVAL); + } + + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, + LDLM_FL_DISCARD_DATA); + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); + if (req == NULL) { + ldlm_lock_list_put(&cancels, l_bl_ast, count); + RETURN(-ENOMEM); + } + + osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa); + rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, + 0, &cancels, count); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + + if (oti != NULL && oa->o_valid & OBD_MD_FLCOOKIE) + oa->o_lcookie = *oti->oti_logcookies; + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + lustre_set_wire_obdo(&body->oa, oa); + + osc_pack_capa(req, body, (struct obd_capa *)capa); + ptlrpc_request_set_replen(req); + + /* If osc_destory is for destroying the unlink orphan, + * sent from MDT to OST, which should not be blocked here, + * because the process might be triggered by ptlrpcd, and + * it is not good to block ptlrpcd thread (b=16006)*/ + if (!(oa->o_flags & OBD_FL_DELORPHAN)) { + req->rq_interpret_reply = osc_destroy_interpret; + if (!osc_can_send_destroy(cli)) { + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, + NULL); + + /* + * Wait until the number of on-going destroy RPCs drops + * under max_rpc_in_flight + */ + l_wait_event_exclusive(cli->cl_destroy_waitq, + osc_can_send_destroy(cli), &lwi); + } + } + + /* Do not wait for response */ + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + RETURN(0); +} + +static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, + long writing_bytes) +{ + obd_flag bits = OBD_MD_FLBLOCKS|OBD_MD_FLGRANT; + + LASSERT(!(oa->o_valid & bits)); + + oa->o_valid |= bits; + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_dirty = cli->cl_dirty; + if (unlikely(cli->cl_dirty - cli->cl_dirty_transit > + cli->cl_dirty_max)) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); + oa->o_undirty = 0; + } else if (unlikely(atomic_read(&obd_unstable_pages) + + atomic_read(&obd_dirty_pages) - + atomic_read(&obd_dirty_transit_pages) > + (long)(obd_max_dirty_pages + 1))) { + /* The atomic_read() allowing the atomic_inc() are + * not covered by a lock thus they may safely race and trip + * this CERROR() unless we add in a small fudge factor (+1). */ + CERROR("%s: dirty %d + %d - %d > system dirty_max %d\n", + cli->cl_import->imp_obd->obd_name, + atomic_read(&obd_unstable_pages), + atomic_read(&obd_dirty_pages), + atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); + oa->o_undirty = 0; + } else if (unlikely(cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff)) { + CERROR("dirty %lu - dirty_max %lu too big???\n", + cli->cl_dirty, cli->cl_dirty_max); + oa->o_undirty = 0; + } else { + long max_in_flight = (cli->cl_max_pages_per_rpc << + PAGE_CACHE_SHIFT)* + (cli->cl_max_rpcs_in_flight + 1); + oa->o_undirty = max(cli->cl_dirty_max, max_in_flight); + } + oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; + oa->o_dropped = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n", + oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); + +} + +void osc_update_next_shrink(struct client_obd *cli) +{ + cli->cl_next_shrink_grant = + cfs_time_shift(cli->cl_grant_shrink_interval); + CDEBUG(D_CACHE, "next time %ld to shrink grant \n", + cli->cl_next_shrink_grant); +} + +static void __osc_update_grant(struct client_obd *cli, obd_size grant) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant += grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +static void osc_update_grant(struct client_obd *cli, struct ost_body *body) +{ + if (body->oa.o_valid & OBD_MD_FLGRANT) { + CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant); + __osc_update_grant(cli, body->oa.o_grant); + } +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set); + +static int osc_shrink_grant_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void *aa, int rc) +{ + struct client_obd *cli = &req->rq_import->imp_obd->u.cli; + struct obdo *oa = ((struct osc_grant_args *)aa)->aa_oa; + struct ost_body *body; + + if (rc != 0) { + __osc_update_grant(cli, oa->o_grant); + GOTO(out, rc); + } + + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + LASSERT(body); + osc_update_grant(cli, body); +out: + OBDO_FREE(oa); + return rc; +} + +static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) +{ + client_obd_list_lock(&cli->cl_loi_list_lock); + oa->o_grant = cli->cl_avail_grant / 4; + cli->cl_avail_grant -= oa->o_grant; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(oa->o_valid & OBD_MD_FLFLAGS)) { + oa->o_valid |= OBD_MD_FLFLAGS; + oa->o_flags = 0; + } + oa->o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); +} + +/* Shrink the current grant, either from some large amount to enough for a + * full set of in-flight RPCs, or if we have already shrunk to that limit + * then to enough for a single RPC. This avoids keeping more grant than + * needed, and avoids shrinking the grant piecemeal. */ +static int osc_shrink_grant(struct client_obd *cli) +{ + __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * + (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT); + + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_avail_grant <= target_bytes) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + return osc_shrink_grant_to_target(cli, target_bytes); +} + +int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) +{ + int rc = 0; + struct ost_body *body; + ENTRY; + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* Don't shrink if we are already above or below the desired limit + * We don't want to shrink below a single RPC, as that will negatively + * impact block allocation and long-term performance. */ + if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT) + target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (target_bytes >= cli->cl_avail_grant) { + client_obd_list_unlock(&cli->cl_loi_list_lock); + RETURN(0); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + OBD_ALLOC_PTR(body); + if (!body) + RETURN(-ENOMEM); + + osc_announce_cached(cli, &body->oa, 0); + + client_obd_list_lock(&cli->cl_loi_list_lock); + body->oa.o_grant = cli->cl_avail_grant - target_bytes; + cli->cl_avail_grant = target_bytes; + client_obd_list_unlock(&cli->cl_loi_list_lock); + if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_SHRINK_GRANT; + osc_update_next_shrink(cli); + + rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, + sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, + sizeof(*body), body, NULL); + if (rc != 0) + __osc_update_grant(cli, body->oa.o_grant); + OBD_FREE_PTR(body); + RETURN(rc); +} + +static int osc_should_shrink_grant(struct client_obd *client) +{ + cfs_time_t time = cfs_time_current(); + cfs_time_t next_shrink = client->cl_next_shrink_grant; + + if ((client->cl_import->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_GRANT_SHRINK) == 0) + return 0; + + if (cfs_time_aftereq(time, next_shrink - 5 * CFS_TICK)) { + /* Get the current RPC size directly, instead of going via: + * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) + * Keep comment here so that it can be found by searching. */ + int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; + + if (client->cl_import->imp_state == LUSTRE_IMP_FULL && + client->cl_avail_grant > brw_size) + return 1; + else + osc_update_next_shrink(client); + } + return 0; +} + +static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) +{ + struct client_obd *client; + + list_for_each_entry(client, &item->ti_obd_list, + cl_grant_shrink_list) { + if (osc_should_shrink_grant(client)) + osc_shrink_grant(client); + } + return 0; +} + +static int osc_add_shrink_grant(struct client_obd *client) +{ + int rc; + + rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, + TIMEOUT_GRANT, + osc_grant_shrink_grant_cb, NULL, + &client->cl_grant_shrink_list); + if (rc) { + CERROR("add grant client %s error %d\n", + client->cl_import->imp_obd->obd_name, rc); + return rc; + } + CDEBUG(D_CACHE, "add grant client %s \n", + client->cl_import->imp_obd->obd_name); + osc_update_next_shrink(client); + return 0; +} + +static int osc_del_shrink_grant(struct client_obd *client) +{ + return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, + TIMEOUT_GRANT); +} + +static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) +{ + /* + * ocd_grant is the total grant amount we're expect to hold: if we've + * been evicted, it's the new avail_grant amount, cl_dirty will drop + * to 0 as inflight RPCs fail out; otherwise, it's avail_grant + dirty. + * + * race is tolerable here: if we're evicted, but imp_state already + * left EVICTED state, then cl_dirty must be 0 already. + */ + client_obd_list_lock(&cli->cl_loi_list_lock); + if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) + cli->cl_avail_grant = ocd->ocd_grant; + else + cli->cl_avail_grant = ocd->ocd_grant - cli->cl_dirty; + + if (cli->cl_avail_grant < 0) { + CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", + cli->cl_import->imp_obd->obd_name, cli->cl_avail_grant, + ocd->ocd_grant, cli->cl_dirty); + /* workaround for servers which do not have the patch from + * LU-2679 */ + cli->cl_avail_grant = ocd->ocd_grant; + } + + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." + "chunk bits: %d.\n", cli->cl_import->imp_obd->obd_name, + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits); + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && + list_empty(&cli->cl_grant_shrink_list)) + osc_add_shrink_grant(cli); +} + +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. */ +static void handle_short_read(int nob_read, obd_count page_count, + struct brw_page **pga) +{ + char *ptr; + int i = 0; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga[i]->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga[i]->pg) + + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr + nob_read, 0, pga[i]->count - nob_read); + kunmap(pga[i]->pg); + page_count--; + i++; + break; + } + + nob_read -= pga[i]->count; + page_count--; + i++; + } + + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga[i]->pg) + (pga[i]->off & ~CFS_PAGE_MASK); + memset(ptr, 0, pga[i]->count); + kunmap(pga[i]->pg); + i++; + } +} + +static int check_write_rcs(struct ptlrpc_request *req, + int requested_nob, int niocount, + obd_count page_count, struct brw_page **pga) +{ + int i; + __u32 *remote_rcs; + + remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, + sizeof(*remote_rcs) * + niocount); + if (remote_rcs == NULL) { + CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); + return(-EPROTO); + } + + /* return error if any niobuf was in error */ + for (i = 0; i < niocount; i++) { + if ((int)remote_rcs[i] < 0) + return(remote_rcs[i]); + + if (remote_rcs[i] != 0) { + CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", + i, remote_rcs[i], req); + return(-EPROTO); + } + } + + if (req->rq_bulk->bd_nob_transferred != requested_nob) { + CERROR("Unexpected # bytes transferred: %d (requested %d)\n", + req->rq_bulk->bd_nob_transferred, requested_nob); + return(-EPROTO); + } + + return (0); +} + +static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) +{ + if (p1->flag != p2->flag) { + unsigned mask = ~(OBD_BRW_FROM_GRANT| OBD_BRW_NOCACHE| + OBD_BRW_SYNC|OBD_BRW_ASYNC|OBD_BRW_NOQUOTA); + + /* warn if we try to combine flags that we don't know to be + * safe to combine */ + if (unlikely((p1->flag & mask) != (p2->flag & mask))) { + CWARN("Saw flags 0x%x and 0x%x in the same brw, please " + "report this at http://bugs.whamcloud.com/\n", + p1->flag, p2->flag); + } + return 0; + } + + return (p1->off + p1->count == p2->off); +} + +static obd_count osc_checksum_bulk(int nob, obd_count pg_count, + struct brw_page **pga, int opc, + cksum_type_t cksum_type) +{ + __u32 cksum; + int i = 0; + struct cfs_crypto_hash_desc *hdesc; + unsigned int bufsize; + int err; + unsigned char cfs_alg = cksum_obd2cfs(cksum_type); + + LASSERT(pg_count > 0); + + hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_alg)); + return PTR_ERR(hdesc); + } + + while (nob > 0 && pg_count > 0) { + int count = pga[i]->count > nob ? nob : pga[i]->count; + + /* corrupt the data before we compute the checksum, to + * simulate an OST->client data error */ + if (i == 0 && opc == OST_READ && + OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) { + unsigned char *ptr = kmap(pga[i]->pg); + int off = pga[i]->off & ~CFS_PAGE_MASK; + memcpy(ptr + off, "bad1", min(4, nob)); + kunmap(pga[i]->pg); + } + cfs_crypto_hash_update_page(hdesc, pga[i]->pg, + pga[i]->off & ~CFS_PAGE_MASK, + count); + LL_CDEBUG_PAGE(D_PAGE, pga[i]->pg, "off %d\n", + (int)(pga[i]->off & ~CFS_PAGE_MASK)); + + nob -= pga[i]->count; + pg_count--; + i++; + } + + bufsize = 4; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize); + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + + /* For sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo */ + if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND)) + cksum++; + + return cksum; +} + +static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, + struct lov_stripe_md *lsm, obd_count page_count, + struct brw_page **pga, + struct ptlrpc_request **reqp, + struct obd_capa *ocapa, int reserve, + int resend) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + int niocount, i, requested_nob, opc, rc; + struct osc_brw_async_args *aa; + struct req_capsule *pill; + struct brw_page *pg_prev; + + ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ)) + RETURN(-ENOMEM); /* Recoverable */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2)) + RETURN(-EINVAL); /* Fatal */ + + if ((cmd & OBD_BRW_WRITE) != 0) { + opc = OST_WRITE; + req = ptlrpc_request_alloc_pool(cli->cl_import, + cli->cl_import->imp_rq_pool, + &RQF_OST_BRW_WRITE); + } else { + opc = OST_READ; + req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ); + } + if (req == NULL) + RETURN(-ENOMEM); + + for (niocount = i = 1; i < page_count; i++) { + if (!can_merge_pages(pga[i - 1], pga[i])) + niocount++; + } + + pill = &req->rq_pill; + req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT, + sizeof(*ioobj)); + req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT, + niocount * sizeof(*niobuf)); + osc_set_capa_size(req, &RMF_CAPA1, ocapa); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ + ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; + + desc = ptlrpc_prep_bulk_imp(req, page_count, + cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS, + opc == OST_WRITE ? BULK_GET_SOURCE : BULK_PUT_SINK, + OST_BULK_PORTAL); + + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ + + body = req_capsule_client_get(pill, &RMF_OST_BODY); + ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ); + niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); + LASSERT(body != NULL && ioobj != NULL && niobuf != NULL); + + lustre_set_wire_obdo(&body->oa, oa); + + obdo_to_ioobj(oa, ioobj); + ioobj->ioo_bufcnt = niocount; + /* The high bits of ioo_max_brw tells server _maximum_ number of bulks + * that might be send for this request. The actual number is decided + * when the RPC is finally sent in ptlrpc_register_bulk(). It sends + * "max - 1" for old client compatibility sending "0", and also so the + * the actual maximum is a power-of-two number, not one less. LU-1431 */ + ioobj_max_brw_set(ioobj, desc->bd_md_max_brw); + osc_pack_capa(req, body, ocapa); + LASSERT(page_count > 0); + pg_prev = pga[0]; + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = pga[i]; + int poff = pg->off & ~CFS_PAGE_MASK; + + LASSERT(pg->count > 0); + /* make sure there is no gap in the middle of page array */ + LASSERTF(page_count == 1 || + (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) && + ergo(i > 0 && i < page_count - 1, + poff == 0 && pg->count == PAGE_CACHE_SIZE) && + ergo(i == page_count - 1, poff == 0)), + "i: %d/%d pg: %p off: "LPU64", count: %u\n", + i, page_count, pg, pg->off, pg->count); + LASSERTF(i == 0 || pg->off > pg_prev->off, + "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64 + " prev_pg %p [pri %lu ind %lu] off "LPU64"\n", + i, page_count, + pg->pg, page_private(pg->pg), pg->pg->index, pg->off, + pg_prev->pg, page_private(pg_prev->pg), + pg_prev->pg->index, pg_prev->off); + LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) == + (pg->flag & OBD_BRW_SRVLOCK)); + + ptlrpc_prep_bulk_page_pin(desc, pg->pg, poff, pg->count); + requested_nob += pg->count; + + if (i > 0 && can_merge_pages(pg_prev, pg)) { + niobuf--; + niobuf->len += pg->count; + } else { + niobuf->offset = pg->off; + niobuf->len = pg->count; + niobuf->flags = pg->flag; + } + pg_prev = pg; + } + + LASSERTF((void *)(niobuf - niocount) == + req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE), + "want %p - real %p\n", req_capsule_client_get(&req->rq_pill, + &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount)); + + osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0); + if (resend) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + body->oa.o_valid |= OBD_MD_FLFLAGS; + body->oa.o_flags = 0; + } + body->oa.o_flags |= OBD_FL_RECOV_RESEND; + } + + if (osc_should_shrink_grant(cli)) + osc_shrink_grant_local(cli, &body->oa); + + /* size[REQ_REC_OFF] still sizeof (*body) */ + if (opc == OST_WRITE) { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + /* store cl_cksum_type in a local variable since + * it can be changed via lprocfs */ + cksum_type_t cksum_type = cli->cl_cksum_type; + + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) { + oa->o_flags &= OBD_FL_LOCAL_MASK; + body->oa.o_flags = 0; + } + body->oa.o_flags |= cksum_type_pack(cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + body->oa.o_cksum = osc_checksum_bulk(requested_nob, + page_count, pga, + OST_WRITE, + cksum_type); + CDEBUG(D_PAGE, "checksum at write origin: %x\n", + body->oa.o_cksum); + /* save this in 'oa', too, for later checking */ + oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + oa->o_flags |= cksum_type_pack(cksum_type); + } else { + /* clear out the checksum flag, in case this is a + * resend but cl_checksum is no longer set. b=11238 */ + oa->o_valid &= ~OBD_MD_FLCKSUM; + } + oa->o_cksum = body->oa.o_cksum; + /* 1 RC per niobuf */ + req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, + sizeof(__u32) * niocount); + } else { + if (cli->cl_checksum && + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { + if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) + body->oa.o_flags = 0; + body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); + body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; + } + } + ptlrpc_request_set_replen(req); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oa = oa; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = niocount; + aa->aa_page_count = page_count; + aa->aa_resends = 0; + aa->aa_ppga = pga; + aa->aa_cli = cli; + INIT_LIST_HEAD(&aa->aa_oaps); + if (ocapa && reserve) + aa->aa_ocapa = capa_get(ocapa); + + *reqp = req; + RETURN(0); + + out: + ptlrpc_req_finished(req); + RETURN(rc); +} + +static int check_write_checksum(struct obdo *oa, const lnet_process_id_t *peer, + __u32 client_cksum, __u32 server_cksum, int nob, + obd_count page_count, struct brw_page **pga, + cksum_type_t client_cksum_type) +{ + __u32 new_cksum; + char *msg; + cksum_type_t cksum_type; + + if (server_cksum == client_cksum) { + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + return 0; + } + + cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? + oa->o_flags : 0); + new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, + cksum_type); + + if (cksum_type != client_cksum_type) + msg = "the server did not use the checksum type specified in " + "the original request - likely a protocol problem"; + else if (new_cksum == server_cksum) + msg = "changed on the client after we checksummed it - " + "likely false positive due to mmap IO (bug 11742)"; + else if (new_cksum == client_cksum) + msg = "changed in transit before arrival at OST"; + else + msg = "changed in transit AND doesn't match the original - " + "likely false positive due to mmap IO (bug 11742)"; + + LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode "DFID + " object "DOSTID" extent ["LPU64"-"LPU64"]\n", + msg, libcfs_nid2str(peer->nid), + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, + oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0, + POSTID(&oa->o_oi), pga[0]->off, + pga[page_count-1]->off + pga[page_count-1]->count - 1); + CERROR("original client csum %x (type %x), server csum %x (type %x), " + "client csum now %x\n", client_cksum, client_cksum_type, + server_cksum, cksum_type, new_cksum); + return 1; +} + +/* Note rc enters this function as number of bytes transferred */ +static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) +{ + struct osc_brw_async_args *aa = (void *)&req->rq_async_args; + const lnet_process_id_t *peer = + &req->rq_import->imp_connection->c_peer; + struct client_obd *cli = aa->aa_cli; + struct ost_body *body; + __u32 client_cksum = 0; + ENTRY; + + if (rc < 0 && rc != -EDQUOT) { + DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); + RETURN(rc); + } + + LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc); + body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); + if (body == NULL) { + DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); + RETURN(-EPROTO); + } + + /* set/clear over quota flag for a uid/gid */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && + body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) { + unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid }; + + CDEBUG(D_QUOTA, "setdq for [%u %u] with valid "LPX64", flags %x\n", + body->oa.o_uid, body->oa.o_gid, body->oa.o_valid, + body->oa.o_flags); + osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); + } + + osc_update_grant(cli, body); + + if (rc < 0) + RETURN(rc); + + if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) + client_cksum = aa->aa_oa->o_cksum; /* save for later */ + + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { + if (rc > 0) { + CERROR("Unexpected +ve rc %d\n", rc); + RETURN(-EPROTO); + } + LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); + + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && + check_write_checksum(&body->oa, peer, client_cksum, + body->oa.o_cksum, aa->aa_requested_nob, + aa->aa_page_count, aa->aa_ppga, + cksum_type_unpack(aa->aa_oa->o_flags))) + RETURN(-EAGAIN); + + rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count, + aa->aa_page_count, aa->aa_ppga); + GOTO(out, rc); + } + + /* The rest of this function executes only for OST_READs */ + + /* if unwrap_bulk failed, return -EAGAIN to retry */ + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + if (rc < 0) + GOTO(out, rc = -EAGAIN); + + if (rc > aa->aa_requested_nob) { + CERROR("Unexpected rc %d (%d requested)\n", rc, + aa->aa_requested_nob); + RETURN(-EPROTO); + } + + if (rc != req->rq_bulk->bd_nob_transferred) { + CERROR ("Unexpected rc %d (%d transferred)\n", + rc, req->rq_bulk->bd_nob_transferred); + return (-EPROTO); + } + + if (rc < aa->aa_requested_nob) + handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); + + if (body->oa.o_valid & OBD_MD_FLCKSUM) { + static int cksum_counter; + __u32 server_cksum = body->oa.o_cksum; + char *via; + char *router; + cksum_type_t cksum_type; + + cksum_type = cksum_type_unpack(body->oa.o_valid &OBD_MD_FLFLAGS? + body->oa.o_flags : 0); + client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, + aa->aa_ppga, OST_READ, + cksum_type); + + if (peer->nid == req->rq_bulk->bd_sender) { + via = router = ""; + } else { + via = " via "; + router = libcfs_nid2str(req->rq_bulk->bd_sender); + } + + if (server_cksum == ~0 && rc > 0) { + CERROR("Protocol error: server %s set the 'checksum' " + "bit, but didn't send a checksum. Not fatal, " + "but please notify on http://bugs.whamcloud.com/\n", + libcfs_nid2str(peer->nid)); + } else if (server_cksum != client_cksum) { + LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from " + "%s%s%s inode "DFID" object "DOSTID + " extent ["LPU64"-"LPU64"]\n", + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(peer->nid), + via, router, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_seq : (__u64)0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_oid : 0, + body->oa.o_valid & OBD_MD_FLFID ? + body->oa.o_parent_ver : 0, + POSTID(&body->oa.o_oi), + aa->aa_ppga[0]->off, + aa->aa_ppga[aa->aa_page_count-1]->off + + aa->aa_ppga[aa->aa_page_count-1]->count - + 1); + CERROR("client %x, server %x, cksum_type %x\n", + client_cksum, server_cksum, cksum_type); + cksum_counter = 0; + aa->aa_oa->o_cksum = client_cksum; + rc = -EAGAIN; + } else { + cksum_counter++; + CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); + rc = 0; + } + } else if (unlikely(client_cksum)) { + static int cksum_missed; + + cksum_missed++; + if ((cksum_missed & (-cksum_missed)) == cksum_missed) + CERROR("Checksum %u requested from %s but not sent\n", + cksum_missed, libcfs_nid2str(peer->nid)); + } else { + rc = 0; + } +out: + if (rc >= 0) + lustre_get_wire_obdo(aa->aa_oa, &body->oa); + + RETURN(rc); +} + +static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa, + struct lov_stripe_md *lsm, + obd_count page_count, struct brw_page **pga, + struct obd_capa *ocapa) +{ + struct ptlrpc_request *req; + int rc; + wait_queue_head_t waitq; + int generation, resends = 0; + struct l_wait_info lwi; + + ENTRY; + + init_waitqueue_head(&waitq); + generation = exp->exp_obd->u.cli.cl_import->imp_generation; + +restart_bulk: + rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm, + page_count, pga, &req, ocapa, 0, resends); + if (rc != 0) + return (rc); + + if (resends) { + req->rq_generation_set = 1; + req->rq_import_generation = generation; + req->rq_sent = cfs_time_current_sec() + resends; + } + + rc = ptlrpc_queue_wait(req); + + if (rc == -ETIMEDOUT && req->rq_resend) { + DEBUG_REQ(D_HA, req, "BULK TIMEOUT"); + ptlrpc_req_finished(req); + goto restart_bulk; + } + + rc = osc_brw_fini_request(req, rc); + + ptlrpc_req_finished(req); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already.*/ + if (osc_recoverable_error(rc)) { + resends++; + if (rc != -EINPROGRESS && + !client_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("%s: too many resend retries for object: " + ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name, + POSTID(&oa->o_oi), rc); + goto out; + } + if (generation != + exp->exp_obd->u.cli.cl_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", exp->exp_obd->obd_name, + POSTID(&oa->o_oi), rc); + goto out; + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; + } +out: + if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + RETURN (rc); +} + +static int osc_brw_redo_request(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + struct ptlrpc_request *new_req; + struct osc_brw_async_args *new_aa; + struct osc_async_page *oap; + ENTRY; + + DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, + "redo for recoverable error %d", rc); + + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, + NULL /* lsm unused by osc currently */, + aa->aa_page_count, aa->aa_ppga, + &new_req, aa->aa_ocapa, 0, 1); + if (rc) + RETURN(rc); + + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request != NULL) { + LASSERTF(request == oap->oap_request, + "request %p != oap_request %p\n", + request, oap->oap_request); + if (oap->oap_interrupted) { + ptlrpc_req_finished(new_req); + RETURN(-EINTR); + } + } + } + /* New request takes over pga and oaps from old request. + * Note that copying a list_head doesn't work, need to move it... */ + aa->aa_resends++; + new_req->rq_interpret_reply = request->rq_interpret_reply; + new_req->rq_async_args = request->rq_async_args; + new_req->rq_commit_cb = request->rq_commit_cb; + /* cap resend delay to the current request timeout, this is similar to + * what ptlrpc does (see after_reply()) */ + if (aa->aa_resends > new_req->rq_timeout) + new_req->rq_sent = cfs_time_current_sec() + new_req->rq_timeout; + else + new_req->rq_sent = cfs_time_current_sec() + aa->aa_resends; + new_req->rq_generation_set = 1; + new_req->rq_import_generation = request->rq_import_generation; + + new_aa = ptlrpc_req_async_args(new_req); + + INIT_LIST_HEAD(&new_aa->aa_oaps); + list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps); + INIT_LIST_HEAD(&new_aa->aa_exts); + list_splice_init(&aa->aa_exts, &new_aa->aa_exts); + new_aa->aa_resends = aa->aa_resends; + + list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) { + if (oap->oap_request) { + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = ptlrpc_request_addref(new_req); + } + } + + new_aa->aa_ocapa = aa->aa_ocapa; + aa->aa_ocapa = NULL; + + /* XXX: This code will run into problem if we're going to support + * to add a series of BRW RPCs into a self-defined ptlrpc_request_set + * and wait for all of them to be finished. We should inherit request + * set from old request. */ + ptlrpcd_add_req(new_req, PDL_POLICY_SAME, -1); + + DEBUG_REQ(D_INFO, new_req, "new request"); + RETURN(0); +} + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page **array, int num) +{ + int stride, i, j; + struct brw_page *tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride]->off > tmp->off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +static obd_count max_unfragmented_pages(struct brw_page **pg, obd_count pages) +{ + int count = 1; + int offset; + int i = 0; + + LASSERT (pages > 0); + offset = pg[i]->off & ~CFS_PAGE_MASK; + + for (;;) { + pages--; + if (pages == 0) /* that's all */ + return count; + + if (offset + pg[i]->count < PAGE_CACHE_SIZE) + return count; /* doesn't end on page boundary */ + + i++; + offset = pg[i]->off & ~CFS_PAGE_MASK; + if (offset != 0) /* doesn't start on page boundary */ + return count; + + count++; + } +} + +static struct brw_page **osc_build_ppga(struct brw_page *pga, obd_count count) +{ + struct brw_page **ppga; + int i; + + OBD_ALLOC(ppga, sizeof(*ppga) * count); + if (ppga == NULL) + return NULL; + + for (i = 0; i < count; i++) + ppga[i] = pga + i; + return ppga; +} + +static void osc_release_ppga(struct brw_page **ppga, obd_count count) +{ + LASSERT(ppga != NULL); + OBD_FREE(ppga, sizeof(*ppga) * count); +} + +static int osc_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, + obd_count page_count, struct brw_page *pga, + struct obd_trans_info *oti) +{ + struct obdo *saved_oa = NULL; + struct brw_page **ppga, **orig; + struct obd_import *imp = class_exp2cliimp(exp); + struct client_obd *cli; + int rc, page_count_orig; + ENTRY; + + LASSERT((imp != NULL) && (imp->imp_obd != NULL)); + cli = &imp->imp_obd->u.cli; + + if (cmd & OBD_BRW_CHECK) { + /* The caller just wants to know if there's a chance that this + * I/O can succeed */ + + if (imp->imp_invalid) + RETURN(-EIO); + RETURN(0); + } + + /* test_brw with a failed create can trip this, maybe others. */ + LASSERT(cli->cl_max_pages_per_rpc); + + rc = 0; + + orig = ppga = osc_build_ppga(pga, page_count); + if (ppga == NULL) + RETURN(-ENOMEM); + page_count_orig = page_count; + + sort_brw_pages(ppga, page_count); + while (page_count) { + obd_count pages_per_brw; + + if (page_count > cli->cl_max_pages_per_rpc) + pages_per_brw = cli->cl_max_pages_per_rpc; + else + pages_per_brw = page_count; + + pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw); + + if (saved_oa != NULL) { + /* restore previously saved oa */ + *oinfo->oi_oa = *saved_oa; + } else if (page_count > pages_per_brw) { + /* save a copy of oa (brw will clobber it) */ + OBDO_ALLOC(saved_oa); + if (saved_oa == NULL) + GOTO(out, rc = -ENOMEM); + *saved_oa = *oinfo->oi_oa; + } + + rc = osc_brw_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md, + pages_per_brw, ppga, oinfo->oi_capa); + + if (rc != 0) + break; + + page_count -= pages_per_brw; + ppga += pages_per_brw; + } + +out: + osc_release_ppga(orig, page_count_orig); + + if (saved_oa != NULL) + OBDO_FREE(saved_oa); + + RETURN(rc); +} + +static int brw_interpret(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct osc_brw_async_args *aa = data; + struct osc_extent *ext; + struct osc_extent *tmp; + struct cl_object *obj = NULL; + struct client_obd *cli = aa->aa_cli; + ENTRY; + + rc = osc_brw_fini_request(req, rc); + CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc); + /* When server return -EINPROGRESS, client should always retry + * regardless of the number of times the bulk was resent already. */ + if (osc_recoverable_error(rc)) { + if (req->rq_import_generation != + req->rq_import->imp_generation) { + CDEBUG(D_HA, "%s: resend cross eviction for object: " + ""DOSTID", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } else if (rc == -EINPROGRESS || + client_should_resend(aa->aa_resends, aa->aa_cli)) { + rc = osc_brw_redo_request(req, aa, rc); + } else { + CERROR("%s: too many resent retries for object: " + ""LPU64":"LPU64", rc = %d.\n", + req->rq_import->imp_obd->obd_name, + POSTID(&aa->aa_oa->o_oi), rc); + } + + if (rc == 0) + RETURN(0); + else if (rc == -EAGAIN || rc == -EINPROGRESS) + rc = -EIO; + } + + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; + } + + list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { + if (obj == NULL && rc == 0) { + obj = osc2cl(ext->oe_obj); + cl_object_get(obj); + } + + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 1, rc); + } + LASSERT(list_empty(&aa->aa_exts)); + LASSERT(list_empty(&aa->aa_oaps)); + + if (obj != NULL) { + struct obdo *oa = aa->aa_oa; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + unsigned long valid = 0; + + LASSERT(rc == 0); + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + if (valid != 0) { + cl_object_attr_lock(obj); + cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } + cl_object_put(env, obj); + } + OBDO_FREE(aa->aa_oa); + + cl_req_completion(env, aa->aa_clerq, rc < 0 ? rc : + req->rq_bulk->bd_nob_transferred); + osc_release_ppga(aa->aa_ppga, aa->aa_page_count); + ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); + + client_obd_list_lock(&cli->cl_loi_list_lock); + /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters + * is called so we know whether to go to sync BRWs or wait for more + * RPCs to complete */ + if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) + cli->cl_w_in_flight--; + else + cli->cl_r_in_flight--; + osc_wake_cache_waiters(cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + RETURN(rc); +} + +static void brw_commit(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + /* If osc_inc_unstable_pages (via osc_extent_finish) races with + * this called via the rq_commit_cb, I need to ensure + * osc_dec_unstable_pages is still called. Otherwise unstable + * pages may be leaked. */ + if (req->rq_unstable) + osc_dec_unstable_pages(req); + else + req->rq_committed = 1; + spin_unlock(&req->rq_lock); +} + +/** + * Build an RPC by the list of extent @ext_list. The caller must ensure + * that the total pages in this list are NOT over max pages per RPC. + * Extents in the list must be in OES_RPC state. + */ +int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, + struct list_head *ext_list, int cmd, pdl_policy_t pol) +{ + struct ptlrpc_request *req = NULL; + struct osc_extent *ext; + LIST_HEAD(rpc_list); + struct brw_page **pga = NULL; + struct osc_brw_async_args *aa = NULL; + struct obdo *oa = NULL; + struct osc_async_page *oap; + struct osc_async_page *tmp; + struct cl_req *clerq = NULL; + enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; + struct ldlm_lock *lock = NULL; + struct cl_req_attr crattr; + obd_off starting_offset = OBD_OBJECT_EOF; + obd_off ending_offset = 0; + int i, rc, mpflag = 0, mem_tight = 0, page_count = 0; + + ENTRY; + LASSERT(!list_empty(ext_list)); + + /* add pages into rpc_list to build BRW rpc */ + list_for_each_entry(ext, ext_list, oe_link) { + LASSERT(ext->oe_state == OES_RPC); + mem_tight |= ext->oe_memalloc; + list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { + ++page_count; + list_add_tail(&oap->oap_rpc_item, &rpc_list); + if (starting_offset > oap->oap_obj_off) + starting_offset = oap->oap_obj_off; + else + LASSERT(oap->oap_page_off == 0); + if (ending_offset < oap->oap_obj_off + oap->oap_count) + ending_offset = oap->oap_obj_off + + oap->oap_count; + else + LASSERT(oap->oap_page_off + oap->oap_count == + PAGE_CACHE_SIZE); + } + } + + if (mem_tight) + mpflag = cfs_memory_pressure_get_and_set(); + + memset(&crattr, 0, sizeof crattr); + OBD_ALLOC(pga, sizeof(*pga) * page_count); + if (pga == NULL) + GOTO(out, rc = -ENOMEM); + + OBDO_ALLOC(oa); + if (oa == NULL) + GOTO(out, rc = -ENOMEM); + + i = 0; + list_for_each_entry(oap, &rpc_list, oap_rpc_item) { + struct cl_page *page = oap2cl_page(oap); + if (clerq == NULL) { + clerq = cl_req_alloc(env, page, crt, + 1 /* only 1-object rpcs for + * now */); + if (IS_ERR(clerq)) + GOTO(out, rc = PTR_ERR(clerq)); + lock = oap->oap_ldlm_lock; + } + if (mem_tight) + oap->oap_brw_flags |= OBD_BRW_MEMALLOC; + pga[i] = &oap->oap_brw_page; + pga[i]->off = oap->oap_obj_off + oap->oap_page_off; + CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", + pga[i]->pg, page_index(oap->oap_page), oap, pga[i]->flag); + i++; + cl_req_page_add(env, clerq, page); + } + + /* always get the data for the obdo for the rpc */ + LASSERT(clerq != NULL); + crattr.cra_oa = oa; + crattr.cra_capa = NULL; + memset(crattr.cra_jobid, 0, JOBSTATS_JOBID_SIZE); + cl_req_attr_set(env, clerq, &crattr, ~0ULL); + if (lock) { + oa->o_handle = lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + + rc = cl_req_prep(env, clerq); + if (rc != 0) { + CERROR("cl_req_prep failed: %d\n", rc); + GOTO(out, rc); + } + + sort_brw_pages(pga, page_count); + rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, + pga, &req, crattr.cra_capa, 1, 0); + if (rc != 0) { + CERROR("prep_req failed: %d\n", rc); + GOTO(out, rc); + } + + req->rq_commit_cb = brw_commit; + req->rq_interpret_reply = brw_interpret; + + if (mem_tight != 0) + req->rq_memalloc = 1; + + /* Need to update the timestamps after the request is built in case + * we race with setattr (locally or in queue at OST). If OST gets + * later setattr before earlier BRW (as determined by the request xid), + * the OST will not use BRW timestamps. Sadly, there is no obvious + * way to do this in a single call. bug 10150 */ + cl_req_attr_set(env, clerq, &crattr, + OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); + + lustre_msg_set_jobid(req->rq_reqmsg, crattr.cra_jobid); + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + INIT_LIST_HEAD(&aa->aa_oaps); + list_splice_init(&rpc_list, &aa->aa_oaps); + INIT_LIST_HEAD(&aa->aa_exts); + list_splice_init(ext_list, &aa->aa_exts); + aa->aa_clerq = clerq; + + /* queued sync pages can be torn down while the pages + * were between the pending list and the rpc */ + tmp = NULL; + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { + /* only one oap gets a request reference */ + if (tmp == NULL) + tmp = oap; + if (oap->oap_interrupted && !req->rq_intr) { + CDEBUG(D_INODE, "oap %p in req %p interrupted\n", + oap, req); + ptlrpc_mark_interrupted(req); + } + } + if (tmp != NULL) + tmp->oap_request = ptlrpc_request_addref(req); + + client_obd_list_lock(&cli->cl_loi_list_lock); + starting_offset >>= PAGE_CACHE_SHIFT; + if (cmd == OBD_BRW_READ) { + cli->cl_r_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); + lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, + starting_offset + 1); + } else { + cli->cl_w_in_flight++; + lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); + lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); + lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, + starting_offset + 1); + } + client_obd_list_unlock(&cli->cl_loi_list_lock); + + DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", + page_count, aa, cli->cl_r_in_flight, + cli->cl_w_in_flight); + + /* XXX: Maybe the caller can check the RPC bulk descriptor to + * see which CPU/NUMA node the majority of pages were allocated + * on, and try to assign the async RPC to the CPU core + * (PDL_POLICY_PREFERRED) to reduce cross-CPU memory traffic. + * + * But on the other hand, we expect that multiple ptlrpcd + * threads and the initial write sponsor can run in parallel, + * especially when data checksum is enabled, which is CPU-bound + * operation and single ptlrpcd thread cannot process in time. + * So more ptlrpcd threads sharing BRW load + * (with PDL_POLICY_ROUND) seems better. + */ + ptlrpcd_add_req(req, pol, -1); + rc = 0; + EXIT; + +out: + if (mem_tight != 0) + cfs_memory_pressure_restore(mpflag); + + capa_put(crattr.cra_capa); + if (rc != 0) { + LASSERT(req == NULL); + + if (oa) + OBDO_FREE(oa); + if (pga) + OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + while (!list_empty(ext_list)) { + ext = list_entry(ext_list->next, struct osc_extent, + oe_link); + list_del_init(&ext->oe_link); + osc_extent_finish(env, ext, 0, rc); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, rc); + } + RETURN(rc); +} + +static int osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + void *data = einfo->ei_cbdata; + int set = 0; + + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); + + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); + + if (lock->l_ast_data == NULL) + lock->l_ast_data = data; + if (lock->l_ast_data == data) + set = 1; + + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); + + return set; +} + +static int osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + int set = 0; + + if (lock != NULL) { + set = osc_set_lock_data_with_check(lock, einfo); + LDLM_LOCK_PUT(lock); + } else + CERROR("lockh %p, data %p - client evicted?\n", + lockh, einfo->ei_cbdata); + return set; +} + +static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) +{ + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); + + ostid_build_res_name(&lsm->lsm_oi, &res_id); + ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + return 0; +} + +/* find any ldlm lock of the inode in osc + * return 0 not find + * 1 find one + * < 0 error */ +static int osc_find_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, + ldlm_iterator_t replace, void *data) +{ + struct ldlm_res_id res_id; + struct obd_device *obd = class_exp2obd(exp); + int rc = 0; + + ostid_build_res_name(&lsm->lsm_oi, &res_id); + rc = ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); + if (rc == LDLM_ITER_STOP) + return(1); + if (rc == LDLM_ITER_CONTINUE) + return(0); + return(rc); +} + +static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, + obd_enqueue_update_f upcall, void *cookie, + __u64 *flags, int agl, int rc) +{ + int intent = *flags & LDLM_FL_HAS_INTENT; + ENTRY; + + if (intent) { + /* The request was created before ldlm_cli_enqueue call. */ + if (rc == ELDLM_LOCK_ABORTED) { + struct ldlm_reply *rep; + rep = req_capsule_server_get(&req->rq_pill, + &RMF_DLM_REP); + + LASSERT(rep != NULL); + if (rep->lock_policy_res1) + rc = rep->lock_policy_res1; + } + } + + if ((intent != 0 && rc == ELDLM_LOCK_ABORTED && agl == 0) || + (rc == 0)) { + *flags |= LDLM_FL_LVB_READY; + CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n", + lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime); + } + + /* Call the update callback. */ + rc = (*upcall)(cookie, rc); + RETURN(rc); +} + +static int osc_enqueue_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_enqueue_args *aa, int rc) +{ + struct ldlm_lock *lock; + struct lustre_handle handle; + __u32 mode; + struct ost_lvb *lvb; + __u32 lvb_len; + __u64 *flags = aa->oa_flags; + + /* Make a local copy of a lock handle and a mode, because aa->oa_* + * might be freed anytime after lock upcall has been called. */ + lustre_handle_copy(&handle, aa->oa_lockh); + mode = aa->oa_ei->ei_mode; + + /* ldlm_cli_enqueue is holding a reference on the lock, so it must + * be valid. */ + lock = ldlm_handle2lock(&handle); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(&handle, mode); + + /* Let CP AST to grant the lock first. */ + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1); + + if (aa->oa_agl && rc == ELDLM_LOCK_ABORTED) { + lvb = NULL; + lvb_len = 0; + } else { + lvb = aa->oa_lvb; + lvb_len = sizeof(*aa->oa_lvb); + } + + /* Complete obtaining the lock procedure. */ + rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1, + mode, flags, lvb, lvb_len, &handle, rc); + /* Complete osc stuff. */ + rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie, + flags, aa->oa_agl, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + + /* Release the lock for async request. */ + if (lustre_handle_is_used(&handle) && rc == ELDLM_OK) + /* + * Releases a reference taken by ldlm_cli_enqueue(), if it is + * not already released by + * ldlm_cli_enqueue_fini()->failed_lock_cleanup() + */ + ldlm_lock_decref(&handle, mode); + + LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n", + aa->oa_lockh, req, aa); + ldlm_lock_decref(&handle, mode); + LDLM_LOCK_PUT(lock); + return rc; +} + +void osc_update_enqueue(struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, + struct ost_lvb *lvb, __u32 mode, int rc) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); + + if (rc == ELDLM_OK) { + __u64 tmp; + + LASSERT(lock != NULL); + loi->loi_lvb = *lvb; + tmp = loi->loi_lvb.lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (tmp > lock->l_policy_data.l_extent.end) + tmp = lock->l_policy_data.l_extent.end + 1; + if (tmp >= loi->loi_kms) { + LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64 + ", kms="LPU64, loi->loi_lvb.lvb_size, tmp); + loi_kms_set(loi, tmp); + } else { + LDLM_DEBUG(lock, "lock acquired, setting rss=" + LPU64"; leaving kms="LPU64", end="LPU64, + loi->loi_lvb.lvb_size, loi->loi_kms, + lock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match(lock); + } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) { + LASSERT(lock != NULL); + loi->loi_lvb = *lvb; + ldlm_lock_allow_match(lock); + CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" + " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms); + rc = ELDLM_OK; + } + + if (lock != NULL) { + if (rc != ELDLM_OK) + ldlm_lock_fail_match(lock); + + LDLM_LOCK_PUT(lock); + } +} +EXPORT_SYMBOL(osc_update_enqueue); + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + +/* When enqueuing asynchronously, locks are not ordered, we can obtain a lock + * from the 2nd OSC before a lock from the 1st one. This does not deadlock with + * other synchronous requests, however keeping some locks and trying to obtain + * others may take a considerable amount of time in a case of ost failure; and + * when other sync requests do not get released lock from a client, the client + * is excluded from the cluster -- such scenarious make the life difficult, so + * release locks just after they are obtained. */ +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u64 *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async, int agl) +{ + struct obd_device *obd = exp->exp_obd; + struct ptlrpc_request *req = NULL; + int intent = *flags & LDLM_FL_HAS_INTENT; + int match_lvb = (agl != 0 ? 0 : LDLM_FL_LVB_READY); + ldlm_mode_t mode; + int rc; + ENTRY; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. + */ + if (!kms_valid) + goto no_match; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. + * + * There are problems with conversion deadlocks, so instead of + * converting a read lock to a write lock, we'll just enqueue a new + * one. + * + * At some point we should cancel the read lock instead of making them + * send us a blocking callback, but there are problems with canceling + * locks out from other users right now, too. */ + mode = einfo->ei_mode; + if (einfo->ei_mode == LCK_PR) + mode |= LCK_PW; + mode = ldlm_lock_match(obd->obd_namespace, *flags | match_lvb, res_id, + einfo->ei_type, policy, mode, lockh, 0); + if (mode) { + struct ldlm_lock *matched = ldlm_handle2lock(lockh); + + if ((agl != 0) && !(matched->l_flags & LDLM_FL_LVB_READY)) { + /* For AGL, if enqueue RPC is sent but the lock is not + * granted, then skip to process this strpe. + * Return -ECANCELED to tell the caller. */ + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + RETURN(-ECANCELED); + } else if (osc_set_lock_data_with_check(matched, einfo)) { + *flags |= LDLM_FL_LVB_READY; + /* addref the lock only if not async requests and PW + * lock is matched whereas we asked for PR. */ + if (!rqset && einfo->ei_mode != mode) + ldlm_lock_addref(lockh, LCK_PR); + if (intent) { + /* I would like to be able to ASSERT here that + * rss <= kms, but I can't, for reasons which + * are explained in lov_enqueue() */ + } + + /* We already have a lock, and it's referenced. + * + * At this point, the cl_lock::cll_state is CLS_QUEUING, + * AGL upcall may change it to CLS_HELD directly. */ + (*upcall)(cookie, ELDLM_OK); + + if (einfo->ei_mode != mode) + ldlm_lock_decref(lockh, LCK_PW); + else if (rqset) + /* For async requests, decref the lock. */ + ldlm_lock_decref(lockh, einfo->ei_mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else { + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); + } + } + + no_match: + if (intent) { + LIST_HEAD(cancels); + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_LDLM_ENQUEUE_LVB); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ldlm_prep_enqueue_req(exp, req, &cancels, 0); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + ptlrpc_request_set_replen(req); + } + + /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ + *flags &= ~LDLM_FL_BLOCK_GRANTED; + + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), LVB_T_OST, lockh, async); + if (rqset) { + if (!rc) { + struct osc_enqueue_args *aa; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->oa_ei = einfo; + aa->oa_exp = exp; + aa->oa_flags = flags; + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_lvb = lvb; + aa->oa_lockh = lockh; + aa->oa_agl = !!agl; + + req->rq_interpret_reply = + (ptlrpc_interpterer_t)osc_enqueue_interpret; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + else + ptlrpc_set_add_req(rqset, req); + } else if (intent) { + ptlrpc_req_finished(req); + } + RETURN(rc); + } + + rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, agl, rc); + if (intent) + ptlrpc_req_finished(req); + + RETURN(rc); +} + +static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset) +{ + struct ldlm_res_id res_id; + int rc; + ENTRY; + + ostid_build_res_name(&oinfo->oi_md->lsm_oi, &res_id); + rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy, + &oinfo->oi_md->lsm_oinfo[0]->loi_lvb, + oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid, + oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh, + rqset, rqset != NULL, 0); + RETURN(rc); +} + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + int *flags, void *data, struct lustre_handle *lockh, + int unref) +{ + struct obd_device *obd = exp->exp_obd; + int lflags = *flags; + ldlm_mode_t rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) + RETURN(-EIO); + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + rc = mode; + if (mode == LCK_PR) + rc |= LCK_PW; + rc = ldlm_lock_match(obd->obd_namespace, lflags, + res_id, type, policy, rc, lockh, unref); + if (rc) { + if (data != NULL) { + if (!osc_set_data_with_check(lockh, data)) { + if (!(lflags & LDLM_FL_TEST_LOCK)) + ldlm_lock_decref(lockh, rc); + RETURN(0); + } + } + if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) { + ldlm_lock_addref(lockh, LCK_PR); + ldlm_lock_decref(lockh, LCK_PW); + } + RETURN(rc); + } + RETURN(rc); +} + +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode) +{ + ENTRY; + + if (unlikely(mode == LCK_GROUP)) + ldlm_lock_decref_and_cancel(lockh, mode); + else + ldlm_lock_decref(lockh, mode); + + RETURN(0); +} + +static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *lockh) +{ + ENTRY; + RETURN(osc_cancel_base(lockh, mode)); +} + +static int osc_cancel_unused(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_cancel_flags_t flags, + void *opaque) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ldlm_res_id res_id, *resp = NULL; + + if (lsm != NULL) { + ostid_build_res_name(&lsm->lsm_oi, &res_id); + resp = &res_id; + } + + return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque); +} + +static int osc_statfs_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + struct osc_async_args *aa, int rc) +{ + struct obd_statfs *msfs; + ENTRY; + + if (rc == -EBADR) + /* The request has in fact never been sent + * due to issues at a higher level (LOV). + * Exit immediately since the caller is + * aware of the problem and takes care + * of the clean up */ + RETURN(rc); + + if ((rc == -ENOTCONN || rc == -EAGAIN) && + (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) + GOTO(out, rc = 0); + + if (rc != 0) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } + + *aa->aa_oi->oi_osfs = *msfs; +out: + rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + RETURN(rc); +} + +static int osc_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + struct obd_device *obd = class_exp2obd(exp); + struct ptlrpc_request *req; + struct osc_async_args *aa; + int rc; + ENTRY; + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS); + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (oinfo->oi_flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret; + CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + aa->aa_oi = oinfo; + + ptlrpc_set_add_req(rqset, req); + RETURN(0); +} + +static int osc_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags) +{ + struct obd_device *obd = class_exp2obd(exp); + struct obd_statfs *msfs; + struct ptlrpc_request *req; + struct obd_import *imp = NULL; + int rc; + ENTRY; + + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + /* We could possibly pass max_age in the request (as an absolute + * timestamp or a "seconds.usec ago") so the target can avoid doing + * extra calls into the filesystem if that isn't necessary (e.g. + * during mount that would help a bit). Having relative timestamps + * is not so great if request processing is slow, while absolute + * timestamps are not ideal because they need time synchronization. */ + req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS); + + class_import_put(imp); + + if (req == NULL) + RETURN(-ENOMEM); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + ptlrpc_request_set_replen(req); + req->rq_request_portal = OST_CREATE_PORTAL; + ptlrpc_at_set_req_timeout(req); + + if (flags & OBD_STATFS_NODELAY) { + /* procfs requests not want stat in wait for avoid deadlock */ + req->rq_no_resend = 1; + req->rq_no_delay = 1; + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); + if (msfs == NULL) { + GOTO(out, rc = -EPROTO); + } + + *osfs = *msfs; + + EXIT; + out: + ptlrpc_req_finished(req); + return rc; +} + +/* Retrieve object striping information. + * + * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_MAGIC (we only use 1 slot here). + */ +static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) +{ + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum, *lumk; + struct lov_user_ost_data_v1 *lmm_objects; + int rc = 0, lum_size; + ENTRY; + + if (!lsm) + RETURN(-ENODATA); + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) + RETURN(-EFAULT); + + if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) + RETURN(-EINVAL); + + /* lov_user_md_vX and lov_mds_md_vX must have the same size */ + LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1)); + LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3)); + LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0])); + + /* we can use lov_mds_md_size() to compute lum_size + * because lov_user_md_vX and lov_mds_md_vX have the same size */ + if (lum.lmm_stripe_count > 0) { + lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic); + OBD_ALLOC(lumk, lum_size); + if (!lumk) + RETURN(-ENOMEM); + + if (lum.lmm_magic == LOV_USER_MAGIC_V1) + lmm_objects = + &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); + else + lmm_objects = &(lumk->lmm_objects[0]); + lmm_objects->l_ost_oi = lsm->lsm_oi; + } else { + lum_size = lov_mds_md_size(0, lum.lmm_magic); + lumk = &lum; + } + + lumk->lmm_oi = lsm->lsm_oi; + lumk->lmm_stripe_count = 1; + + if (copy_to_user(lump, lumk, lum_size)) + rc = -EFAULT; + + if (lumk != &lum) + OBD_FREE(lumk, lum_size); + + RETURN(rc); +} + + +static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = exp->exp_obd; + struct obd_ioctl_data *data = karg; + int err = 0; + ENTRY; + + if (!try_module_get(THIS_MODULE)) { + CERROR("Can't get module. Is it alive?"); + return -EINVAL; + } + switch (cmd) { + case OBD_IOC_LOV_GET_CONFIG: { + char *buf; + struct lov_desc *desc; + struct obd_uuid uuid; + + buf = NULL; + len = 0; + if (obd_ioctl_getdata(&buf, &len, (void *)uarg)) + GOTO(out, err = -EINVAL); + + data = (struct obd_ioctl_data *)buf; + + if (sizeof(*desc) > data->ioc_inllen1) { + obd_ioctl_freedata(buf, len); + GOTO(out, err = -EINVAL); + } + + if (data->ioc_inllen2 < sizeof(uuid)) { + obd_ioctl_freedata(buf, len); + GOTO(out, err = -EINVAL); + } + + desc = (struct lov_desc *)data->ioc_inlbuf1; + desc->ld_tgt_count = 1; + desc->ld_active_tgt_count = 1; + desc->ld_default_stripe_count = 1; + desc->ld_default_stripe_size = 0; + desc->ld_default_stripe_offset = 0; + desc->ld_pattern = 0; + memcpy(&desc->ld_uuid, &obd->obd_uuid, sizeof(uuid)); + + memcpy(data->ioc_inlbuf2, &obd->obd_uuid, sizeof(uuid)); + + err = copy_to_user((void *)uarg, buf, len); + if (err) + err = -EFAULT; + obd_ioctl_freedata(buf, len); + GOTO(out, err); + } + case LL_IOC_LOV_SETSTRIPE: + err = obd_alloc_memmd(exp, karg); + if (err > 0) + err = 0; + GOTO(out, err); + case LL_IOC_LOV_GETSTRIPE: + err = osc_getstripe(karg, uarg); + GOTO(out, err); + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obd->u.cli.cl_import, + data->ioc_inlbuf1, 0); + if (err > 0) + err = 0; + GOTO(out, err); + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obd->u.cli.cl_import, + data->ioc_offset); + GOTO(out, err); + case OBD_IOC_POLL_QUOTACHECK: + err = osc_quota_poll_check(exp, (struct if_quotacheck *)karg); + GOTO(out, err); + case OBD_IOC_PING_TARGET: + err = ptlrpc_obd_ping(obd); + GOTO(out, err); + default: + CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n", + cmd, current_comm()); + GOTO(out, err = -ENOTTY); + } +out: + module_put(THIS_MODULE); + return err; +} + +static int osc_get_info(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + ENTRY; + if (!vallen || !val) + RETURN(-EFAULT); + + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { + __u32 *stripe = val; + *vallen = sizeof(*stripe); + *stripe = 0; + RETURN(0); + } else if (KEY_IS(KEY_LAST_ID)) { + struct ptlrpc_request *req; + obd_id *reply; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_LAST_ID); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + + req->rq_no_delay = req->rq_no_resend = 1; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_OBD_ID); + if (reply == NULL) + GOTO(out, rc = -EPROTO); + + *((obd_id *)val) = *reply; + out: + ptlrpc_req_finished(req); + RETURN(rc); + } else if (KEY_IS(KEY_FIEMAP)) { + struct ptlrpc_request *req; + struct ll_user_fiemap *reply; + char *tmp; + int rc; + + req = ptlrpc_request_alloc(class_exp2cliimp(exp), + &RQF_OST_GET_INFO_FIEMAP); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_CLIENT, *vallen); + req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, + RCL_SERVER, *vallen); + + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); + memcpy(tmp, val, *vallen); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out1, rc); + + reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); + if (reply == NULL) + GOTO(out1, rc = -EPROTO); + + memcpy(val, reply, *vallen); + out1: + ptlrpc_req_finished(req); + + RETURN(rc); + } + + RETURN(-EINVAL); +} + +static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, + obd_count keylen, void *key, obd_count vallen, + void *val, struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + struct obd_device *obd = exp->exp_obd; + struct obd_import *imp = class_exp2cliimp(exp); + char *tmp; + int rc; + ENTRY; + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10); + + if (KEY_IS(KEY_CHECKSUM)) { + if (vallen != sizeof(int)) + RETURN(-EINVAL); + exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0; + RETURN(0); + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + RETURN(0); + } + + if (KEY_IS(KEY_FLUSH_CTX)) { + sptlrpc_import_flush_my_ctx(imp); + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_SET)) { + struct client_obd *cli = &obd->u.cli; + + LASSERT(cli->cl_cache == NULL); /* only once */ + cli->cl_cache = (struct cl_client_cache *)val; + atomic_inc(&cli->cl_cache->ccc_users); + cli->cl_lru_left = &cli->cl_cache->ccc_lru_left; + + /* add this osc into entity list */ + LASSERT(list_empty(&cli->cl_lru_osc)); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + + RETURN(0); + } + + if (KEY_IS(KEY_CACHE_LRU_SHRINK)) { + struct client_obd *cli = &obd->u.cli; + int nr = atomic_read(&cli->cl_lru_in_list) >> 1; + int target = *(int *)val; + + nr = osc_lru_shrink(cli, min(nr, target)); + *(int *)val -= nr; + RETURN(0); + } + + if (!set && !KEY_IS(KEY_GRANT_SHRINK)) + RETURN(-EINVAL); + + /* We pass all other commands directly to OST. Since nobody calls osc + methods directly and everybody is supposed to go through LOV, we + assume lov checked invalid values for us. + The only recognised values so far are evict_by_nid and mds_conn. + Even if something bad goes through, we'd get a -EINVAL from OST + anyway. */ + + req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ? + &RQF_OST_SET_GRANT_INFO : + &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + if (!KEY_IS(KEY_GRANT_SHRINK)) + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ? + &RMF_OST_BODY : + &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + if (KEY_IS(KEY_GRANT_SHRINK)) { + struct osc_grant_args *aa; + struct obdo *oa; + + CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + OBDO_ALLOC(oa); + if (!oa) { + ptlrpc_req_finished(req); + RETURN(-ENOMEM); + } + *oa = ((struct ost_body *)val)->oa; + aa->aa_oa = oa; + req->rq_interpret_reply = osc_shrink_grant_interpret; + } + + ptlrpc_request_set_replen(req); + if (!KEY_IS(KEY_GRANT_SHRINK)) { + LASSERT(set != NULL); + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + RETURN(0); +} + + +static int osc_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *index) +{ + /* this code is not supposed to be used with LOD/OSP + * to be removed soon */ + LBUG(); + return 0; +} + +static int osc_llog_finish(struct obd_device *obd, int count) +{ + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); + if (ctxt) { + llog_cat_close(NULL, ctxt->loc_handle); + llog_cleanup(NULL, ctxt); + } + + ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) + llog_cleanup(NULL, ctxt); + RETURN(0); +} + +static int osc_reconnect(const struct lu_env *env, + struct obd_export *exp, struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + struct client_obd *cli = &obd->u.cli; + + if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { + long lost_grant; + + client_obd_list_lock(&cli->cl_loi_list_lock); + data->ocd_grant = (cli->cl_avail_grant + cli->cl_dirty) ?: + 2 * cli_brw_size(obd); + lost_grant = cli->cl_lost_grant; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + + CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d" + " ocd_grant: %d, lost: %ld.\n", data->ocd_connect_flags, + data->ocd_version, data->ocd_grant, lost_grant); + } + + RETURN(0); +} + +static int osc_disconnect(struct obd_export *exp) +{ + struct obd_device *obd = class_exp2obd(exp); + struct llog_ctxt *ctxt; + int rc; + + ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) { + if (obd->u.cli.cl_conn_count == 1) { + /* Flush any remaining cancel messages out to the + * target */ + llog_sync(ctxt, exp, 0); + } + llog_ctxt_put(ctxt); + } else { + CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n", + obd); + } + + rc = client_disconnect_export(exp); + /** + * Initially we put del_shrink_grant before disconnect_export, but it + * causes the following problem if setup (connect) and cleanup + * (disconnect) are tangled together. + * connect p1 disconnect p2 + * ptlrpc_connect_import + * ............... class_manual_cleanup + * osc_disconnect + * del_shrink_grant + * ptlrpc_connect_interrupt + * init_grant_shrink + * add this client to shrink list + * cleanup_osc + * Bang! pinger trigger the shrink. + * So the osc should be disconnected from the shrink list, after we + * are sure the import has been destroyed. BUG18662 + */ + if (obd->u.cli.cl_import == NULL) + osc_del_shrink_grant(&obd->u.cli); + return rc; +} + +static int osc_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + struct client_obd *cli; + int rc = 0; + + ENTRY; + LASSERT(imp->imp_obd == obd); + + switch (event) { + case IMP_EVENT_DISCON: { + cli = &obd->u.cli; + client_obd_list_lock(&cli->cl_loi_list_lock); + cli->cl_avail_grant = 0; + cli->cl_lost_grant = 0; + client_obd_list_unlock(&cli->cl_loi_list_lock); + break; + } + case IMP_EVENT_INACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); + break; + } + case IMP_EVENT_INVALIDATE: { + struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants */ + cli = &obd->u.cli; + /* all pages go to failing rpcs due to the invalid + * import */ + osc_io_unplug(env, cli, NULL, PDL_POLICY_ROUND); + + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + break; + } + case IMP_EVENT_ACTIVE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); + break; + } + case IMP_EVENT_OCD: { + struct obd_connect_data *ocd = &imp->imp_connect_data; + + if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) + osc_init_grant(&obd->u.cli, ocd); + + /* See bug 7198 */ + if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) + imp->imp_client->cli_request_portal =OST_REQUEST_PORTAL; + + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); + break; + } + case IMP_EVENT_DEACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL); + break; + } + case IMP_EVENT_ACTIVATE: { + rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL); + break; + } + default: + CERROR("Unknown import event %d\n", event); + LBUG(); + } + RETURN(rc); +} + +/** + * Determine whether the lock can be canceled before replaying the lock + * during recovery, see bug16774 for detailed information. + * + * \retval zero the lock can't be canceled + * \retval other ok to cancel + */ +static int osc_cancel_for_recovery(struct ldlm_lock *lock) +{ + check_res_locked(lock->l_resource); + + /* + * Cancel all unused extent lock in granted mode LCK_PR or LCK_CR. + * + * XXX as a future improvement, we can also cancel unused write lock + * if it doesn't have dirty data and active mmaps. + */ + if (lock->l_resource->lr_type == LDLM_EXTENT && + (lock->l_granted_mode == LCK_PR || + lock->l_granted_mode == LCK_CR) && + (osc_dlm_lock_pageref(lock) == 0)) + RETURN(1); + + RETURN(0); +} + +static int brw_queue_work(const struct lu_env *env, void *data) +{ + struct client_obd *cli = data; + + CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); + + osc_io_unplug(env, cli, NULL, PDL_POLICY_SAME); + RETURN(0); +} + +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { 0 }; + struct client_obd *cli = &obd->u.cli; + void *handler; + int rc; + ENTRY; + + rc = ptlrpcd_addref(); + if (rc) + RETURN(rc); + + rc = client_obd_setup(obd, lcfg); + if (rc) + GOTO(out_ptlrpcd, rc); + + handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); + if (IS_ERR(handler)) + GOTO(out_client_setup, rc = PTR_ERR(handler)); + cli->cl_writeback_work = handler; + + rc = osc_quota_setup(obd); + if (rc) + GOTO(out_ptlrpcd_work, rc); + + cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + lprocfs_osc_init_vars(&lvars); + if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) { + lproc_osc_attach_seqstat(obd); + sptlrpc_lprocfs_cliobd_attach(obd); + ptlrpc_lprocfs_register_obd(obd); + } + + /* We need to allocate a few requests more, because + * brw_interpret tries to create new requests before freeing + * previous ones, Ideally we want to have 2x max_rpcs_in_flight + * reserved, but I'm afraid that might be too much wasted RAM + * in fact, so 2 is just my guess and still should work. */ + cli->cl_import->imp_rq_pool = + ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, + OST_MAXREQSIZE, + ptlrpc_add_rqs_to_pool); + + INIT_LIST_HEAD(&cli->cl_grant_shrink_list); + ns_register_cancel(obd->obd_namespace, osc_cancel_for_recovery); + RETURN(rc); + +out_ptlrpcd_work: + ptlrpcd_destroy_work(handler); +out_client_setup: + client_obd_cleanup(obd); +out_ptlrpcd: + ptlrpcd_decref(); + RETURN(rc); +} + +static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) +{ + int rc = 0; + ENTRY; + + switch (stage) { + case OBD_CLEANUP_EARLY: { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_HA, "Deactivating import %s\n", obd->obd_name); + /* ptlrpc_abort_inflight to stop an mds_lov_synchronize */ + ptlrpc_deactivate_import(imp); + spin_lock(&imp->imp_lock); + imp->imp_pingable = 0; + spin_unlock(&imp->imp_lock); + break; + } + case OBD_CLEANUP_EXPORTS: { + struct client_obd *cli = &obd->u.cli; + /* LU-464 + * for echo client, export may be on zombie list, wait for + * zombie thread to cull it, because cli.cl_import will be + * cleared in client_disconnect_export(): + * class_export_destroy() -> obd_cleanup() -> + * echo_device_free() -> echo_client_cleanup() -> + * obd_disconnect() -> osc_disconnect() -> + * client_disconnect_export() + */ + obd_zombie_barrier(); + if (cli->cl_writeback_work) { + ptlrpcd_destroy_work(cli->cl_writeback_work); + cli->cl_writeback_work = NULL; + } + obd_cleanup_client_import(obd); + ptlrpc_lprocfs_unregister_obd(obd); + lprocfs_obd_cleanup(obd); + rc = obd_llog_finish(obd, 0); + if (rc != 0) + CERROR("failed to cleanup llogging subsystems\n"); + break; + } + } + RETURN(rc); +} + +int osc_cleanup(struct obd_device *obd) +{ + struct client_obd *cli = &obd->u.cli; + int rc; + + ENTRY; + + /* lru cleanup */ + if (cli->cl_cache != NULL) { + LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); + spin_lock(&cli->cl_cache->ccc_lru_lock); + list_del_init(&cli->cl_lru_osc); + spin_unlock(&cli->cl_cache->ccc_lru_lock); + cli->cl_lru_left = NULL; + atomic_dec(&cli->cl_cache->ccc_users); + cli->cl_cache = NULL; + } + + /* free memory of osc quota cache */ + osc_quota_cleanup(obd); + + rc = client_obd_cleanup(obd); + + ptlrpcd_decref(); + RETURN(rc); +} + +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct lprocfs_static_vars lvars = { 0 }; + int rc = 0; + + lprocfs_osc_init_vars(&lvars); + + switch (lcfg->lcfg_command) { + default: + rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, + lcfg, obd); + if (rc > 0) + rc = 0; + break; + } + + return(rc); +} + +static int osc_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + +struct obd_ops osc_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = osc_setup, + .o_precleanup = osc_precleanup, + .o_cleanup = osc_cleanup, + .o_add_conn = client_import_add_conn, + .o_del_conn = client_import_del_conn, + .o_connect = client_connect_import, + .o_reconnect = osc_reconnect, + .o_disconnect = osc_disconnect, + .o_statfs = osc_statfs, + .o_statfs_async = osc_statfs_async, + .o_packmd = osc_packmd, + .o_unpackmd = osc_unpackmd, + .o_create = osc_create, + .o_destroy = osc_destroy, + .o_getattr = osc_getattr, + .o_getattr_async = osc_getattr_async, + .o_setattr = osc_setattr, + .o_setattr_async = osc_setattr_async, + .o_brw = osc_brw, + .o_punch = osc_punch, + .o_sync = osc_sync, + .o_enqueue = osc_enqueue, + .o_change_cbdata = osc_change_cbdata, + .o_find_cbdata = osc_find_cbdata, + .o_cancel = osc_cancel, + .o_cancel_unused = osc_cancel_unused, + .o_iocontrol = osc_iocontrol, + .o_get_info = osc_get_info, + .o_set_info_async = osc_set_info_async, + .o_import_event = osc_import_event, + .o_llog_init = osc_llog_init, + .o_llog_finish = osc_llog_finish, + .o_process_config = osc_process_config, + .o_quotactl = osc_quotactl, + .o_quotacheck = osc_quotacheck, +}; + +extern struct lu_kmem_descr osc_caches[]; +extern spinlock_t osc_ast_guard; +extern struct lock_class_key osc_ast_guard_class; + +int __init osc_init(void) +{ + struct lprocfs_static_vars lvars = { 0 }; + int rc; + ENTRY; + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + + lprocfs_osc_init_vars(&lvars); + + rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars, + LUSTRE_OSC_NAME, &osc_device_type); + if (rc) { + lu_kmem_fini(osc_caches); + RETURN(rc); + } + + spin_lock_init(&osc_ast_guard); + lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); + + RETURN(rc); +} + +static void /*__exit*/ osc_exit(void) +{ + class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); +MODULE_LICENSE("GPL"); + +cfs_module(osc, LUSTRE_VERSION_STRING, osc_init, osc_exit); diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile new file mode 100644 index 000000000000..983eb66a554d --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/Makefile @@ -0,0 +1,23 @@ +obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o +LDLM := ../../lustre/ldlm/ + +ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o +ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o +ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o +ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o +ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o +ldlm_objs += $(LDLM)ldlm_pool.o +ldlm_objs += $(LDLM)interval_tree.o +ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o +ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o +ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o +ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o +ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o sec_lproc.o +ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o + +ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) + +obj-$(CONFIG_PTLRPC_GSS) += gss/ + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c new file mode 100644 index 000000000000..22f7e654c9d8 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/client.c @@ -0,0 +1,3059 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** Implementation of client-side PortalRPC interfaces */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +static int ptlrpc_send_new_req(struct ptlrpc_request *req); + +/** + * Initialize passed in client structure \a cl. + */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *cl) +{ + cl->cli_request_portal = req_portal; + cl->cli_reply_portal = rep_portal; + cl->cli_name = name; +} +EXPORT_SYMBOL(ptlrpc_init_client); + +/** + * Return PortalRPC connection for remore uud \a uuid + */ +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) +{ + struct ptlrpc_connection *c; + lnet_nid_t self; + lnet_process_id_t peer; + int err; + + /* ptlrpc_uuid_to_peer() initializes its 2nd parameter + * before accessing its values. */ + /* coverity[uninit_use_in_call] */ + err = ptlrpc_uuid_to_peer(uuid, &peer, &self); + if (err != 0) { + CNETERR("cannot find peer %s!\n", uuid->uuid); + return NULL; + } + + c = ptlrpc_connection_get(peer, self, uuid); + if (c) { + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); + } + + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); + + return c; +} +EXPORT_SYMBOL(ptlrpc_uuid_to_connection); + +/** + * Allocate and initialize new bulk descriptor on the sender. + * Returns pointer to the descriptor or NULL on error. + */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct ptlrpc_bulk_desc *desc; + int i; + + OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[npages])); + if (!desc) + return NULL; + + spin_lock_init(&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + desc->bd_max_iov = npages; + desc->bd_iov_count = 0; + desc->bd_portal = portal; + desc->bd_type = type; + desc->bd_md_count = 0; + LASSERT(max_brw > 0); + desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); + /* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this + * node. Negotiated ocd_brw_size will always be <= this number. */ + for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) + LNetInvalidateHandle(&desc->bd_mds[i]); + + return desc; +} + +/** + * Prepare bulk descriptor for specified outgoing request \a req that + * can fit \a npages * pages. \a type is bulk type. \a portal is where + * the bulk to be sent. Used on client-side. + * Returns pointer to newly allocatrd initialized bulk descriptor or NULL on + * error. + */ +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_bulk_desc *desc; + + ENTRY; + LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); + desc = ptlrpc_new_bulk(npages, max_brw, type, portal); + if (desc == NULL) + RETURN(NULL); + + desc->bd_import_generation = req->rq_import_generation; + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + + desc->bd_cbid.cbid_fn = client_bulk_callback; + desc->bd_cbid.cbid_arg = desc; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; +} +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); + +/** + * Add a page \a page to the bulk descriptor \a desc. + * Data to transfer in the page starts at offset \a pageoffset and + * amount of data to transfer from the page is \a len + */ +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int pin) +{ + LASSERT(desc->bd_iov_count < desc->bd_max_iov); + LASSERT(page != NULL); + LASSERT(pageoffset >= 0); + LASSERT(len > 0); + LASSERT(pageoffset + len <= PAGE_CACHE_SIZE); + + desc->bd_nob += len; + + if (pin) + page_cache_get(page); + + ptlrpc_add_bulk_page(desc, page, pageoffset, len); +} +EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); + +/** + * Uninitialize and free bulk descriptor \a desc. + * Works on bulk descriptors both from server and client side. + */ +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) +{ + int i; + ENTRY; + + LASSERT(desc != NULL); + LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ + LASSERT(desc->bd_md_count == 0); /* network hands off */ + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + + sptlrpc_enc_pool_put_pages(desc); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); + + if (unpin) { + for (i = 0; i < desc->bd_iov_count ; i++) + page_cache_release(desc->bd_iov[i].kiov_page); + } + + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + bd_iov[desc->bd_max_iov])); + EXIT; +} +EXPORT_SYMBOL(__ptlrpc_free_bulk); + +/** + * Set server timelimit for this req, i.e. how long are we willing to wait + * for reply before timing out this request. + */ +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) +{ + __u32 serv_est; + int idx; + struct imp_at *at; + + LASSERT(req->rq_import); + + if (AT_OFF) { + /* non-AT settings */ + /** + * \a imp_server_timeout means this is reverse import and + * we send (currently only) ASTs to the client and cannot afford + * to wait too long for the reply, otherwise the other client + * (because of which we are sending this request) would + * timeout waiting for us + */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + } else { + at = &req->rq_import->imp_at; + idx = import_at_get_index(req->rq_import, + req->rq_request_portal); + serv_est = at_get(&at->iat_service_estimate[idx]); + req->rq_timeout = at_est2timeout(serv_est); + } + /* We could get even fancier here, using history to predict increased + loading... */ + + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} +EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req, + unsigned int serv_est) +{ + int idx; + unsigned int oldse; + struct imp_at *at; + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + oldse = at_measured(&at->iat_service_estimate[idx], serv_est); + if (oldse != 0) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name,req->rq_request_portal, + oldse, at_get(&at->iat_service_estimate[idx])); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) +{ + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, + unsigned int service_time) +{ + unsigned int nl, oldnl; + struct imp_at *at; + time_t now = cfs_time_current_sec(); + + LASSERT(req->rq_import); + at = &req->rq_import->imp_at; + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/; + if (service_time > now - req->rq_sent + 3 /* bz16408 */) + CWARN("Reported service time %u > total measured time " + CFS_DURATION_T"\n", service_time, + cfs_time_sub(now, req->rq_sent)); + + oldnl = at_measured(&at->iat_net_latency, nl); + if (oldnl != 0) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) " + "has changed from %d to %d\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + oldnl, at_get(&at->iat_net_latency)); +} + +static int unpack_reply(struct ptlrpc_request *req) +{ + int rc; + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return(-EPROTO); + } + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return(-EPROTO); + } + return 0; +} + +/** + * Handle an early reply message, called with the rq_lock held. + * If anything goes wrong just ignore it - same as if it never happened + */ +static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_request *early_req; + time_t olddl; + int rc; + ENTRY; + + req->rq_early = 0; + spin_unlock(&req->rq_lock); + + rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); + if (rc) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + rc = unpack_reply(early_req); + if (rc == 0) { + /* Expecting to increase the service time estimate here */ + ptlrpc_at_adj_service(req, + lustre_msg_get_timeout(early_req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(early_req->rq_repmsg)); + } + + sptlrpc_cli_finish_early_reply(early_req); + + if (rc != 0) { + spin_lock(&req->rq_lock); + RETURN(rc); + } + + /* Adjust the local timeout for this req */ + ptlrpc_at_set_req_timeout(req); + + spin_lock(&req->rq_lock); + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when it sent the + * early reply, so client should give it at least that long. */ + req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in "CFS_DURATION_T"s " + "("CFS_DURATION_T"s)", req->rq_early_count, + cfs_time_sub(req->rq_deadline, cfs_time_current_sec()), + cfs_time_sub(req->rq_deadline, olddl)); + + RETURN(rc); +} + +/** + * Wind down request pool \a pool. + * Frees all requests from the pool too + */ +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) +{ + struct list_head *l, *tmp; + struct ptlrpc_request *req; + + LASSERT(pool != NULL); + + spin_lock(&pool->prp_lock); + list_for_each_safe(l, tmp, &pool->prp_req_list) { + req = list_entry(l, struct ptlrpc_request, rq_list); + list_del(&req->rq_list); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); + OBD_FREE_LARGE(req->rq_reqbuf, pool->prp_rq_size); + OBD_FREE(req, sizeof(*req)); + } + spin_unlock(&pool->prp_lock); + OBD_FREE(pool, sizeof(*pool)); +} +EXPORT_SYMBOL(ptlrpc_free_rq_pool); + +/** + * Allocates, initializes and adds \a num_rq requests to the pool \a pool + */ +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) +{ + int i; + int size = 1; + + while (size < pool->prp_rq_size) + size <<= 1; + + LASSERTF(list_empty(&pool->prp_req_list) || + size == pool->prp_rq_size, + "Trying to change pool size with nonempty pool " + "from %d to %d bytes\n", pool->prp_rq_size, size); + + spin_lock(&pool->prp_lock); + pool->prp_rq_size = size; + for (i = 0; i < num_rq; i++) { + struct ptlrpc_request *req; + struct lustre_msg *msg; + + spin_unlock(&pool->prp_lock); + OBD_ALLOC(req, sizeof(struct ptlrpc_request)); + if (!req) + return; + OBD_ALLOC_LARGE(msg, size); + if (!msg) { + OBD_FREE(req, sizeof(struct ptlrpc_request)); + return; + } + req->rq_reqbuf = msg; + req->rq_reqbuf_len = size; + req->rq_pool = pool; + spin_lock(&pool->prp_lock); + list_add_tail(&req->rq_list, &pool->prp_req_list); + } + spin_unlock(&pool->prp_lock); + return; +} +EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); + +/** + * Create and initialize new request pool with given attributes: + * \a num_rq - initial number of requests to create for the pool + * \a msgsize - maximum message size possible for requests in thid pool + * \a populate_pool - function to be called when more requests need to be added + * to the pool + * Returns pointer to newly created pool or NULL on error. + */ +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + void (*populate_pool)(struct ptlrpc_request_pool *, int)) +{ + struct ptlrpc_request_pool *pool; + + OBD_ALLOC(pool, sizeof (struct ptlrpc_request_pool)); + if (!pool) + return NULL; + + /* Request next power of two for the allocation, because internally + kernel would do exactly this */ + + spin_lock_init(&pool->prp_lock); + INIT_LIST_HEAD(&pool->prp_req_list); + pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; + pool->prp_populate = populate_pool; + + populate_pool(pool, num_rq); + + if (list_empty(&pool->prp_req_list)) { + /* have not allocated a single request for the pool */ + OBD_FREE(pool, sizeof (struct ptlrpc_request_pool)); + pool = NULL; + } + return pool; +} +EXPORT_SYMBOL(ptlrpc_init_rq_pool); + +/** + * Fetches one request from pool \a pool + */ +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + struct lustre_msg *reqbuf; + + if (!pool) + return NULL; + + spin_lock(&pool->prp_lock); + + /* See if we have anything in a pool, and bail out if nothing, + * in writeout path, where this matters, this is safe to do, because + * nothing is lost in this case, and when some in-flight requests + * complete, this code will be called again. */ + if (unlikely(list_empty(&pool->prp_req_list))) { + spin_unlock(&pool->prp_lock); + return NULL; + } + + request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, + rq_list); + list_del_init(&request->rq_list); + spin_unlock(&pool->prp_lock); + + LASSERT(request->rq_reqbuf); + LASSERT(request->rq_pool); + + reqbuf = request->rq_reqbuf; + memset(request, 0, sizeof(*request)); + request->rq_reqbuf = reqbuf; + request->rq_reqbuf_len = pool->prp_rq_size; + request->rq_pool = pool; + + return request; +} + +/** + * Returns freed \a request to pool. + */ +static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) +{ + struct ptlrpc_request_pool *pool = request->rq_pool; + + spin_lock(&pool->prp_lock); + LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); + list_add_tail(&request->rq_list, &pool->prp_req_list); + spin_unlock(&pool->prp_lock); +} + +static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp = request->rq_import; + int rc; + ENTRY; + + if (unlikely(ctx)) + request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); + else { + rc = sptlrpc_req_get_ctx(request); + if (rc) + GOTO(out_free, rc); + } + + sptlrpc_req_set_flavor(request, opcode); + + rc = lustre_pack_request(request, imp->imp_msg_magic, count, + lengths, bufs); + if (rc) { + LASSERT(!request->rq_pool); + GOTO(out_ctx, rc); + } + + lustre_msg_add_version(request->rq_reqmsg, version); + request->rq_send_state = LUSTRE_IMP_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + request->rq_export = NULL; + + request->rq_req_cbid.cbid_fn = request_out_callback; + request->rq_req_cbid.cbid_arg = request; + + request->rq_reply_cbid.cbid_fn = reply_in_callback; + request->rq_reply_cbid.cbid_arg = request; + + request->rq_reply_deadline = 0; + request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; + + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); + + spin_lock_init(&request->rq_lock); + INIT_LIST_HEAD(&request->rq_list); + INIT_LIST_HEAD(&request->rq_timed_list); + INIT_LIST_HEAD(&request->rq_replay_list); + INIT_LIST_HEAD(&request->rq_ctx_chain); + INIT_LIST_HEAD(&request->rq_set_chain); + INIT_LIST_HEAD(&request->rq_history_list); + INIT_LIST_HEAD(&request->rq_exp_list); + init_waitqueue_head(&request->rq_reply_waitq); + init_waitqueue_head(&request->rq_set_waitq); + request->rq_xid = ptlrpc_next_xid(); + atomic_set(&request->rq_refcount, 1); + + lustre_msg_set_opc(request->rq_reqmsg, opcode); + + RETURN(0); +out_ctx: + sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); +out_free: + class_import_put(imp); + return rc; +} + +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx) +{ + int count; + + count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); + return __ptlrpc_request_bufs_pack(request, version, opcode, count, + request->rq_pill.rc_area[RCL_CLIENT], + bufs, ctx); +} +EXPORT_SYMBOL(ptlrpc_request_bufs_pack); + +/** + * Pack request buffers for network transfer, performing necessary encryption + * steps if necessary. + */ +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode) +{ + int rc; + rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); + if (rc) + return rc; + + /* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of + * ptlrpc_body sent from server equal to local ptlrpc_body size, so we + * have to send old ptlrpc_body to keep interoprability with these + * clients. + * + * Only three kinds of server->client RPCs so far: + * - LDLM_BL_CALLBACK + * - LDLM_CP_CALLBACK + * - LDLM_GL_CALLBACK + * + * XXX This should be removed whenever we drop the interoprability with + * the these old clients. + */ + if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || + opcode == LDLM_GL_CALLBACK) + req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, + sizeof(struct ptlrpc_body_v2), RCL_CLIENT); + + return rc; +} +EXPORT_SYMBOL(ptlrpc_request_pack); + +/** + * Helper function to allocate new request on import \a imp + * and possibly using existing request from pool \a pool if provided. + * Returns allocated request structure with import field filled or + * NULL on error. + */ +static inline +struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request = NULL; + + if (pool) + request = ptlrpc_prep_req_from_pool(pool); + + if (!request) + OBD_ALLOC_PTR(request); + + if (request) { + LASSERTF((unsigned long)imp > 0x1000, "%p", imp); + LASSERT(imp != LP_POISON); + LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p", + imp->imp_client); + LASSERT(imp->imp_client != LP_POISON); + + request->rq_import = class_import_get(imp); + } else { + CERROR("request allocation out of memory\n"); + } + + return request; +} + +/** + * Helper function for creating a request. + * Calls __ptlrpc_request_alloc to allocate new request sturcture and inits + * buffer structures according to capsule template \a format. + * Returns allocated request structure pointer or NULL on error. + */ +static struct ptlrpc_request * +ptlrpc_request_alloc_internal(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + struct ptlrpc_request *request; + + request = __ptlrpc_request_alloc(imp, pool); + if (request == NULL) + return NULL; + + req_capsule_init(&request->rq_pill, request, RCL_CLIENT); + req_capsule_set(&request->rq_pill, format); + return request; +} + +/** + * Allocate new request structure for import \a imp and initialize its + * buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, NULL, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc); + +/** + * Allocate new request structure for import \a imp from pool \a pool and + * initialize its buffer structure according to capsule template \a format. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool * pool, + const struct req_format *format) +{ + return ptlrpc_request_alloc_internal(imp, pool, format); +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pool); + +/** + * For requests not from pool, free memory of the request structure. + * For requests obtained from a pool earlier, return request back to pool. + */ +void ptlrpc_request_free(struct ptlrpc_request *request) +{ + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + OBD_FREE_PTR(request); +} +EXPORT_SYMBOL(ptlrpc_request_free); + +/** + * Allocate new request for operatione \a opcode and immediatelly pack it for + * network transfer. + * Only used for simple requests like OBD_PING where the only important + * part of the request is operation itself. + * Returns allocated request or NULL on error. + */ +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode) +{ + struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); + int rc; + + if (req) { + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + } + } + return req; +} +EXPORT_SYMBOL(ptlrpc_request_alloc_pack); + +/** + * Prepare request (fetched from pool \a poolif not NULL) on import \a imp + * for operation \a opcode. Request would contain \a count buffers. + * Sizes of buffers are described in array \a lengths and buffers themselves + * are provided by a pointer \a bufs. + * Returns prepared request structure pointer or NULL on error. + */ +struct ptlrpc_request * +ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool) +{ + struct ptlrpc_request *request; + int rc; + + request = __ptlrpc_request_alloc(imp, pool); + if (!request) + return NULL; + + rc = __ptlrpc_request_bufs_pack(request, version, opcode, count, + lengths, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + request = NULL; + } + return request; +} +EXPORT_SYMBOL(ptlrpc_prep_req_pool); + +/** + * Same as ptlrpc_prep_req_pool, but without pool + */ +struct ptlrpc_request * +ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, + __u32 *lengths, char **bufs) +{ + return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs, + NULL); +} +EXPORT_SYMBOL(ptlrpc_prep_req); + +/** + * Allocate and initialize new request set structure. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + + ENTRY; + OBD_ALLOC(set, sizeof *set); + if (!set) + RETURN(NULL); + atomic_set(&set->set_refcount, 1); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + atomic_set(&set->set_new_count, 0); + atomic_set(&set->set_remaining, 0); + spin_lock_init(&set->set_new_req_lock); + INIT_LIST_HEAD(&set->set_new_requests); + INIT_LIST_HEAD(&set->set_cblist); + set->set_max_inflight = UINT_MAX; + set->set_producer = NULL; + set->set_producer_arg = NULL; + set->set_rc = 0; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_set); + +/** + * Allocate and initialize new request set structure with flow control + * extension. This extension allows to control the number of requests in-flight + * for the whole set. A callback function to generate requests must be provided + * and the request set will keep the number of requests sent over the wire to + * @max_inflight. + * Returns a pointer to the newly allocated set structure or NULL on error. + */ +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg) + +{ + struct ptlrpc_request_set *set; + + set = ptlrpc_prep_set(); + if (!set) + RETURN(NULL); + + set->set_max_inflight = max; + set->set_producer = func; + set->set_producer_arg = arg; + + RETURN(set); +} +EXPORT_SYMBOL(ptlrpc_prep_fcset); + +/** + * Wind down and free request set structure previously allocated with + * ptlrpc_prep_set. + * Ensures that all requests on the set have completed and removes + * all requests from the request list in a set. + * If any unsent request happen to be on the list, pretends that they got + * an error in flight and calls their completion handler. + */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (atomic_read(&set->set_remaining) == 0) ? + RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each (tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + n++; + } + + LASSERTF(atomic_read(&set->set_remaining) == 0 || + atomic_read(&set->set_remaining) == n, "%d / %d\n", + atomic_read(&set->set_remaining), n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT(req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + ptlrpc_req_interpret(NULL, req, -EBADR); + atomic_dec(&set->set_remaining); + } + + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + ptlrpc_req_finished (req); + } + + LASSERT(atomic_read(&set->set_remaining) == 0); + + ptlrpc_reqset_put(set); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_set_destroy); + +/** + * Add a callback function \a fn to the set. + * This function would be called when all requests on this set are completed. + * The function will be passed \a data argument. + */ +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data) +{ + struct ptlrpc_set_cbdata *cbdata; + + OBD_ALLOC_PTR(cbdata); + if (cbdata == NULL) + RETURN(-ENOMEM); + + cbdata->psc_interpret = fn; + cbdata->psc_data = data; + list_add_tail(&cbdata->psc_item, &set->set_cblist); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_set_add_cb); + +/** + * Add a new request to the general purpose request set. + * Assumes request reference from the caller. + */ +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) +{ + LASSERT(list_empty(&req->rq_set_chain)); + + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + atomic_inc(&set->set_remaining); + req->rq_queued_time = cfs_time_current(); + + if (req->rq_reqmsg != NULL) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + if (set->set_producer != NULL) + /* If the request set has a producer callback, the RPC must be + * sent straight away */ + ptlrpc_send_new_req(req); +} +EXPORT_SYMBOL(ptlrpc_set_add_req); + +/** + * Add a request to a request with dedicated server thread + * and wake the thread to make any necessary processing. + * Currently only used for ptlrpcd. + */ +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set = pc->pc_set; + int count, i; + + LASSERT(req->rq_set == NULL); + LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); + + spin_lock(&set->set_new_req_lock); + /* + * The set takes over the caller's request reference. + */ + req->rq_set = set; + req->rq_queued_time = cfs_time_current(); + list_add_tail(&req->rq_set_chain, &set->set_new_requests); + count = atomic_inc_return(&set->set_new_count); + spin_unlock(&set->set_new_req_lock); + + /* Only need to call wakeup once for the first entry. */ + if (count == 1) { + wake_up(&set->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpc_set_add_new_req); + +/** + * Based on the current state of the import, determine if the request + * can be sent, is an error, or should be delayed. + * + * Returns true if this request should be delayed. If false, and + * *status is set, then the request can not be sent and *status is the + * error code. If false and status is 0, then request can be sent. + * + * The imp->imp_lock must be held. + */ +static int ptlrpc_import_delay_req(struct obd_import *imp, + struct ptlrpc_request *req, int *status) +{ + int delay = 0; + ENTRY; + + LASSERT (status != NULL); + *status = 0; + + if (req->rq_ctx_init || req->rq_ctx_fini) { + /* always allow ctx init/fini rpc go through */ + } else if (imp->imp_state == LUSTRE_IMP_NEW) { + DEBUG_REQ(D_ERROR, req, "Uninitialized import."); + *status = -EIO; + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + /* pings may safely race with umount */ + DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? + D_HA : D_ERROR, req, "IMP_CLOSED "); + *status = -EIO; + } else if (ptlrpc_send_limit_expired(req)) { + /* probably doesn't need to be a D_ERROR after initial testing */ + DEBUG_REQ(D_ERROR, req, "send limit expired "); + *status = -EIO; + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + imp->imp_state == LUSTRE_IMP_CONNECTING) { + /* allow CONNECT even if import is invalid */ ; + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } + } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { + if (!imp->imp_deactive) + DEBUG_REQ(D_NET, req, "IMP_INVALID"); + *status = -ESHUTDOWN; /* bz 12940 */ + } else if (req->rq_import_generation != imp->imp_generation) { + DEBUG_REQ(D_ERROR, req, "req wrong generation:"); + *status = -EIO; + } else if (req->rq_send_state != imp->imp_state) { + /* invalidate in progress - any requests should be drop */ + if (atomic_read(&imp->imp_inval_count) != 0) { + DEBUG_REQ(D_ERROR, req, "invalidate in flight"); + *status = -EIO; + } else if (imp->imp_dlm_fake || req->rq_no_delay) { + *status = -EWOULDBLOCK; + } else if (req->rq_allow_replay && + (imp->imp_state == LUSTRE_IMP_REPLAY || + imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || + imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || + imp->imp_state == LUSTRE_IMP_RECOVER)) { + DEBUG_REQ(D_HA, req, "allow during recovery.\n"); + } else { + delay = 1; + } + } + + RETURN(delay); +} + +/** + * Decide if the eror message regarding provided request \a req + * should be printed to the console or not. + * Makes it's decision on request status and other properties. + * Returns 1 to print error on the system console or 0 if not. + */ +static int ptlrpc_console_allow(struct ptlrpc_request *req) +{ + __u32 opc; + int err; + + LASSERT(req->rq_reqmsg != NULL); + opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Suppress particular reconnect errors which are to be expected. No + * errors are suppressed for the initial connection on an import */ + if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && + (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { + + /* Suppress timed out reconnect requests */ + if (req->rq_timedout) + return 0; + + /* Suppress unavailable/again reconnect requests */ + err = lustre_msg_get_status(req->rq_repmsg); + if (err == -ENODEV || err == -EAGAIN) + return 0; + } + + return 1; +} + +/** + * Check request processing status. + * Returns the status. + */ +static int ptlrpc_check_status(struct ptlrpc_request *req) +{ + int err; + ENTRY; + + err = lustre_msg_get_status(req->rq_repmsg); + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { + struct obd_import *imp = req->rq_import; + __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); + if (ptlrpc_console_allow(req)) + LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s," + " operation %s failed with %d.\n", + imp->imp_obd->obd_name, + libcfs_nid2str( + imp->imp_connection->c_peer.nid), + ll_opcode2str(opc), err); + RETURN(err < 0 ? err : -EINVAL); + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + RETURN(err); +} + +/** + * save pre-versions of objects into request for replay. + * Versions are obtained from server reply. + * used for VBR. + */ +static void ptlrpc_save_versions(struct ptlrpc_request *req) +{ + struct lustre_msg *repmsg = req->rq_repmsg; + struct lustre_msg *reqmsg = req->rq_reqmsg; + __u64 *versions = lustre_msg_get_versions(repmsg); + ENTRY; + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) + return; + + LASSERT(versions); + lustre_msg_set_versions(reqmsg, versions); + CDEBUG(D_INFO, "Client save versions ["LPX64"/"LPX64"]\n", + versions[0], versions[1]); + + EXIT; +} + +/** + * Callback function called when client receives RPC reply for \a req. + * Returns 0 on success or error code. + * The return alue would be assigned to req->rq_status by the caller + * as request processing status. + * This function also decides if the request needs to be saved for later replay. + */ +static int after_reply(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct obd_device *obd = req->rq_import->imp_obd; + int rc; + struct timeval work_start; + long timediff; + ENTRY; + + LASSERT(obd != NULL); + /* repbuf must be unlinked */ + LASSERT(!req->rq_receiving_reply && !req->rq_must_unlink); + + if (req->rq_reply_truncate) { + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_ERROR, req, "reply buffer overflow," + " expected: %d, actual size: %d", + req->rq_nob_received, req->rq_repbuf_len); + RETURN(-EOVERFLOW); + } + + sptlrpc_cli_free_repbuf(req); + /* Pass the required reply buffer size (include + * space for early reply). + * NB: no need to roundup because alloc_repbuf + * will roundup it */ + req->rq_replen = req->rq_nob_received; + req->rq_nob_received = 0; + req->rq_resend = 1; + RETURN(0); + } + + /* + * NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. + */ + rc = sptlrpc_cli_unwrap_reply(req); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); + RETURN(rc); + } + + /* + * Security layer unwrap might ask resend this request. + */ + if (req->rq_resend) + RETURN(0); + + rc = unpack_reply(req); + if (rc) + RETURN(rc); + + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time_t now = cfs_time_current_sec(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + req->rq_resend = 1; + req->rq_nr_resend++; + + /* allocate new xid to avoid reply reconstruction */ + if (!req->rq_bulk) { + /* new xid is already allocated for bulk in + * ptlrpc_check_set() */ + req->rq_xid = ptlrpc_next_xid(); + DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for " + "resend on EINPROGRESS"); + } + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + + RETURN(0); + } + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL); + if (obd->obd_svc_stats != NULL) { + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, + timediff); + ptlrpc_lprocfs_rpc_sent(req, timediff); + } + + if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && + lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", + lustre_msg_get_type(req->rq_repmsg)); + RETURN(-EPROTO); + } + + if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); + ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); + ptlrpc_at_adj_net_latency(req, + lustre_msg_get_service_time(req->rq_repmsg)); + + rc = ptlrpc_check_status(req); + imp->imp_connect_error = rc; + + if (rc) { + /* + * Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * the upcall. + */ + if (ll_rpc_recoverable_error(rc)) { + if (req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(rc); + } + ptlrpc_request_handle_notconn(req); + RETURN(rc); + } + } else { + /* + * Let's look if server sent slv. Do it only for RPC with + * rc == 0. + */ + ldlm_cli_update_pool(req); + } + + /* + * Store transno in reqmsg for replay. + */ + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { + req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); + lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); + } + + if (imp->imp_replayable) { + spin_lock(&imp->imp_lock); + /* + * No point in adding already-committed requests to the replay + * list, we will just remove them immediately. b=9829 + */ + if (req->rq_transno != 0 && + (req->rq_transno > + lustre_msg_get_last_committed(req->rq_repmsg) || + req->rq_replay)) { + /** version recovery */ + ptlrpc_save_versions(req); + ptlrpc_retain_replayable_request(req, imp); + } else if (req->rq_commit_cb != NULL) { + spin_unlock(&imp->imp_lock); + req->rq_commit_cb(req); + spin_lock(&imp->imp_lock); + } + + /* + * Replay-enabled imports return commit-status information. + */ + if (lustre_msg_get_last_committed(req->rq_repmsg)) { + imp->imp_peer_committed_transno = + lustre_msg_get_last_committed(req->rq_repmsg); + } + + ptlrpc_free_committed(imp); + + if (!list_empty(&imp->imp_replay_list)) { + struct ptlrpc_request *last; + + last = list_entry(imp->imp_replay_list.prev, + struct ptlrpc_request, + rq_replay_list); + /* + * Requests with rq_replay stay on the list even if no + * commit is expected. + */ + if (last->rq_transno > imp->imp_peer_committed_transno) + ptlrpc_pinger_commit_expected(imp); + } + + spin_unlock(&imp->imp_lock); + } + + RETURN(rc); +} + +/** + * Helper function to send request \a req over the network for the first time + * Also adjusts request phase. + * Returns 0 on success or error code. + */ +static int ptlrpc_send_new_req(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + int rc; + ENTRY; + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()) && + (!req->rq_generation_set || + req->rq_import_generation == imp->imp_generation)) + RETURN (0); + + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); + + spin_lock(&imp->imp_lock); + + if (!req->rq_generation_set) + req->rq_import_generation = imp->imp_generation; + + if (ptlrpc_import_delay_req(imp, req, &rc)) { + spin_lock(&req->rq_lock); + req->rq_waiting = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " + "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc != 0) { + spin_unlock(&imp->imp_lock); + req->rq_status = rc; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + RETURN(rc); + } + + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); + spin_unlock(&imp->imp_lock); + + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + rc = sptlrpc_req_refresh_ctx(req, -1); + if (rc) { + if (req->rq_err) { + req->rq_status = rc; + RETURN(1); + } else { + req->rq_wait_ctx = 1; + RETURN(0); + } + } + + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc" + " %s:%s:%d:"LPU64":%s:%d\n", current_comm(), + imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); + req->rq_net_err = 1; + RETURN(rc); + } + RETURN(0); +} + +static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) +{ + int remaining, rc; + ENTRY; + + LASSERT(set->set_producer != NULL); + + remaining = atomic_read(&set->set_remaining); + + /* populate the ->set_requests list with requests until we + * reach the maximum number of RPCs in flight for this set */ + while (atomic_read(&set->set_remaining) < set->set_max_inflight) { + rc = set->set_producer(set, set->set_producer_arg); + if (rc == -ENOENT) { + /* no more RPC to produce */ + set->set_producer = NULL; + set->set_producer_arg = NULL; + RETURN(0); + } + } + + RETURN((atomic_read(&set->set_remaining) - remaining)); +} + +/** + * this sends any unsent RPCs in \a set and returns 1 if all are sent + * and no more replies are expected. + * (it is possible to get less replies than requests sent e.g. due to timed out + * requests or requests that we had trouble to send out) + */ +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *next; + int force_timer_recalc = 0; + ENTRY; + + if (atomic_read(&set->set_remaining) == 0) + RETURN(1); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + struct obd_import *imp = req->rq_import; + int unregistered = 0; + int rc = 0; + + if (req->rq_phase == RQ_PHASE_NEW && + ptlrpc_send_new_req(req)) { + force_timer_recalc = 1; + } + + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > cfs_time_current_sec()) + continue; + + if (!(req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREGISTERING || + req->rq_phase == RQ_PHASE_COMPLETE)) { + DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); + LBUG(); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + + if (req->rq_phase == RQ_PHASE_COMPLETE) + continue; + + if (req->rq_phase == RQ_PHASE_INTERPRET) + GOTO(interpret, req->rq_status); + + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + /* If there is no need to resend, fail it now. */ + if (req->rq_no_resend) { + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } else { + continue; + } + } + + if (req->rq_err) { + spin_lock(&req->rq_lock); + req->rq_replied = 0; + spin_unlock(&req->rq_lock); + if (req->rq_status == 0) + req->rq_status = -EIO; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + /* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr + * so it sets rq_intr regardless of individual rpc + * timeouts. The synchronous IO waiting path sets + * rq_intr irrespective of whether ptlrpcd + * has seen a timeout. Our policy is to only interpret + * interrupted rpcs after they have timed out, so we + * need to enforce that here. + */ + + if (req->rq_intr && (req->rq_timedout || req->rq_waiting || + req->rq_wait_ctx)) { + req->rq_status = -EINTR; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + if (req->rq_phase == RQ_PHASE_RPC) { + if (req->rq_timedout || req->rq_resend || + req->rq_waiting || req->rq_wait_ctx) { + int status; + + if (!ptlrpc_unregister_reply(req, 1)) + continue; + + spin_lock(&imp->imp_lock); + if (ptlrpc_import_delay_req(imp, req, &status)){ + /* put on delay list - only if we wait + * recovery finished - before send */ + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp-> + imp_delayed_list); + spin_unlock(&imp->imp_lock); + continue; + } + + if (status != 0) { + req->rq_status = status; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + if (ptlrpc_no_resend(req) && + !req->rq_wait_ctx) { + req->rq_status = -ENOTCONN; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); + spin_unlock(&imp->imp_lock); + GOTO(interpret, req->rq_status); + } + + list_del_init(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + + spin_unlock(&imp->imp_lock); + + spin_lock(&req->rq_lock); + req->rq_waiting = 0; + spin_unlock(&req->rq_lock); + + if (req->rq_timedout || req->rq_resend) { + /* This is re-sending anyways, + * let's mark req as resend. */ + spin_lock(&req->rq_lock); + req->rq_resend = 1; + spin_unlock(&req->rq_lock); + if (req->rq_bulk) { + __u64 old_xid; + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* ensure previous bulk fails */ + old_xid = req->rq_xid; + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk " + "old x"LPU64 + " new x"LPU64"\n", + old_xid, req->rq_xid); + } + } + /* + * rq_wait_ctx is only touched by ptlrpcd, + * so no lock is needed here. + */ + status = sptlrpc_req_refresh_ctx(req, -1); + if (status) { + if (req->rq_err) { + req->rq_status = status; + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + force_timer_recalc = 1; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 1; + spin_unlock(&req->rq_lock); + } + + continue; + } else { + spin_lock(&req->rq_lock); + req->rq_wait_ctx = 0; + spin_unlock(&req->rq_lock); + } + + rc = ptl_send_rpc(req, 0); + if (rc) { + DEBUG_REQ(D_HA, req, + "send failed: rc = %d", rc); + force_timer_recalc = 1; + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + /* need to reset the timeout */ + force_timer_recalc = 1; + } + + spin_lock(&req->rq_lock); + + if (ptlrpc_client_early(req)) { + ptlrpc_at_recv_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + + /* Still waiting for a reply? */ + if (ptlrpc_client_recv(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + /* Did we actually receive a reply? */ + if (!ptlrpc_client_replied(req)) { + spin_unlock(&req->rq_lock); + continue; + } + + spin_unlock(&req->rq_lock); + + /* unlink from net because we are going to + * swab in-place of reply buffer */ + unregistered = ptlrpc_unregister_reply(req, 1); + if (!unregistered) + continue; + + req->rq_status = after_reply(req); + if (req->rq_resend) + continue; + + /* If there is no bulk associated with this request, + * then we're done and should let the interpreter + * process the reply. Similarly if the RPC returned + * an error, and therefore the bulk will never arrive. + */ + if (req->rq_bulk == NULL || req->rq_status < 0) { + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + GOTO(interpret, req->rq_status); + } + + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); + } + + LASSERT(req->rq_phase == RQ_PHASE_BULK); + if (ptlrpc_client_bulk_active(req)) + continue; + + if (req->rq_bulk->bd_failure) { + /* The RPC reply arrived OK, but the bulk screwed + * up! Dead weird since the server told us the RPC + * was good after getting the REPLY for her GET or + * the ACK for her PUT. */ + DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); + req->rq_status = -EIO; + } + + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); + + interpret: + LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); + + /* This moves to "unregistering" phase we need to wait for + * reply unlink. */ + if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { + /* start async bulk unlink too */ + ptlrpc_unregister_bulk(req, 1); + continue; + } + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* When calling interpret receiving already should be + * finished. */ + LASSERT(!req->rq_receiving_reply); + + ptlrpc_req_interpret(env, req, req->rq_status); + + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + + CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0, + "Completed RPC pname:cluuid:pid:xid:nid:" + "opc %s:%s:%d:"LPU64":%s:%d\n", + current_comm(), imp->imp_obd->obd_uuid.uuid, + lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + lustre_msg_get_opc(req->rq_reqmsg)); + + spin_lock(&imp->imp_lock); + /* Request already may be not on sending or delaying list. This + * may happen in the case of marking it erroneous for the case + * ptlrpc_import_delay_req(req, status) find it impossible to + * allow sending this rpc and returns *status != 0. */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + spin_unlock(&imp->imp_lock); + + atomic_dec(&set->set_remaining); + wake_up_all(&imp->imp_recovery_waitq); + + if (set->set_producer) { + /* produce a new request if possible */ + if (ptlrpc_set_producer(set) > 0) + force_timer_recalc = 1; + + /* free the request that has just been completed + * in order not to pollute set->set_requests */ + list_del_init(&req->rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_set = NULL; + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + + /* record rq_status to compute the final status later */ + if (req->rq_status != 0) + set->set_rc = req->rq_status; + ptlrpc_req_finished(req); + } + } + + /* If we hit an error, we want to recover promptly. */ + RETURN(atomic_read(&set->set_remaining) == 0 || force_timer_recalc); +} +EXPORT_SYMBOL(ptlrpc_check_set); + +/** + * Time out request \a req. is \a async_unlink is set, that means do not wait + * until LNet actually confirms network buffer unlinking. + * Return 1 if we should give up further retrying attempts or 0 otherwise. + */ +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) +{ + struct obd_import *imp = req->rq_import; + int rc = 0; + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_timedout = 1; + spin_unlock(&req->rq_lock); + + DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T + "/real "CFS_DURATION_T"]", + req->rq_net_err ? "failed due to network error" : + ((req->rq_real_sent == 0 || + cfs_time_before(req->rq_real_sent, req->rq_sent) || + cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ? + "timed out for sent delay" : "timed out for slow reply"), + req->rq_sent, req->rq_real_sent); + + if (imp != NULL && obd_debug_peer_on_timeout) + LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); + + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + atomic_inc(&imp->imp_timeouts); + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + RETURN(1); + + /* If this request is for recovery or other primordial tasks, + * then error it out here. */ + if (req->rq_ctx_init || req->rq_ctx_fini || + req->rq_send_state != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); + spin_lock(&req->rq_lock); + req->rq_status = -ETIMEDOUT; + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(1); + } + + /* if a request can't be resent we can't wait for an answer after + the timeout */ + if (ptlrpc_no_resend(req)) { + DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); + rc = 1; + } + + ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); + + RETURN(rc); +} + +/** + * Time out all uncompleted requests in request set pointed by \a data + * Callback used when waiting on sets with l_wait_event. + * Always returns 1. + */ +int ptlrpc_expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time_t now = cfs_time_current_sec(); + ENTRY; + + LASSERT(set != NULL); + + /* + * A timeout expired. See which reqs it applies to... + */ + list_for_each (tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) + continue; + + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ + continue; + + /* Deal with this guy. Do it asynchronously to not block + * ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); + } + + /* + * When waiting for a whole set, we always break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * if everyone's timed out. + */ + RETURN(1); +} +EXPORT_SYMBOL(ptlrpc_expired_set); + +/** + * Sets rq_intr flag in \a req under spinlock. + */ +void ptlrpc_mark_interrupted(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_mark_interrupted); + +/** + * Interrupts (sets interrupted flag) all uncompleted requests in + * a set \a data. Callback for l_wait_event for interruptible waits. + */ +void ptlrpc_interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + + LASSERT(set != NULL); + CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREGISTERING) + continue; + + ptlrpc_mark_interrupted(req); + } +} +EXPORT_SYMBOL(ptlrpc_interrupted_set); + +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + time_t now = cfs_time_current_sec(); + int timeout = 0; + struct ptlrpc_request *req; + int deadline; + ENTRY; + + SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ + + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + continue; + + /* + * Already timed out. + */ + if (req->rq_timedout) + continue; + + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) + continue; + + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; + else + deadline = req->rq_sent + req->rq_timeout; + + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } + RETURN(timeout); +} +EXPORT_SYMBOL(ptlrpc_set_next_timeout); + +/** + * Send all unset request from the set and then wait untill all + * requests in the set complete (either get a reply, timeout, get an + * error or otherwise be interrupted). + * Returns 0 on success or error code otherwise. + */ +int ptlrpc_set_wait(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + int rc, timeout; + ENTRY; + + if (set->set_producer) + (void)ptlrpc_set_producer(set); + else + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase == RQ_PHASE_NEW) + (void)ptlrpc_send_new_req(req); + } + + if (list_empty(&set->set_requests)) + RETURN(0); + + do { + timeout = ptlrpc_set_next_timeout(set); + + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", + set, timeout); + + if (timeout == 0 && !cfs_signal_pending()) + /* + * No requests are in-flight (ether timed out + * or delayed), so we can allow interrupts. + * We still want to block for a limited time, + * so we allow interrupts during the timeout. + */ + lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), + ptlrpc_expired_set, + ptlrpc_interrupted_set, set); + else + /* + * At least one request is in flight, so no + * interrupts are allowed. Wait until all + * complete, or an in-flight req times out. + */ + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout? timeout : 1), + ptlrpc_expired_set, set); + + rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); + + /* LU-769 - if we ignored the signal because it was already + * pending when we started, we need to handle it now or we risk + * it being ignored forever */ + if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && + cfs_signal_pending()) { + sigset_t blocked_sigs = + cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + /* In fact we only interrupt for the "fatal" signals + * like SIGINT or SIGKILL. We still ignore less + * important signals since ptlrpc set is not easily + * reentrant from userspace again */ + if (cfs_signal_pending()) + ptlrpc_interrupted_set(set); + cfs_restore_sigs(blocked_sigs); + } + + LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. */ + if (rc == 0 && atomic_read(&set->set_remaining) == 0) { + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, + rq_set_chain); + spin_lock(&req->rq_lock); + req->rq_invalid_rqset = 1; + spin_unlock(&req->rq_lock); + } + } + } while (rc != 0 || atomic_read(&set->set_remaining) != 0); + + LASSERT(atomic_read(&set->set_remaining) == 0); + + rc = set->set_rc; /* rq_status of already freed requests if any */ + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set,void *,int) = + set->set_interpret; + rc = interpreter (set, set->set_arg, rc); + } else { + struct ptlrpc_set_cbdata *cbdata, *n; + int err; + + list_for_each_entry_safe(cbdata, n, + &set->set_cblist, psc_item) { + list_del_init(&cbdata->psc_item); + err = cbdata->psc_interpret(set, cbdata->psc_data, rc); + if (err && !rc) + rc = err; + OBD_FREE_PTR(cbdata); + } + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_wait); + +/** + * Helper fuction for request freeing. + * Called when request count reached zero and request needs to be freed. + * Removes request from all sorts of sending/replay lists it might be on, + * frees network buffers if any are present. + * If \a locked is set, that means caller is already holding import imp_lock + * and so we no longer need to reobtain it (for certain lists manipulations) + */ +static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) +{ + ENTRY; + if (request == NULL) { + EXIT; + return; + } + + LASSERTF(!request->rq_receiving_reply, "req %p\n", request); + LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */ + LASSERTF(list_empty(&request->rq_list), "req %p\n", request); + LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); + LASSERTF(!request->rq_replay, "req %p\n", request); + + req_capsule_fini(&request->rq_pill); + + /* We must take it off the imp_replay_list first. Otherwise, we'll set + * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ + if (request->rq_import != NULL) { + if (!locked) + spin_lock(&request->rq_import->imp_lock); + list_del_init(&request->rq_replay_list); + if (!locked) + spin_unlock(&request->rq_import->imp_lock); + } + LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); + + if (atomic_read(&request->rq_refcount) != 0) { + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); + LBUG(); + } + + if (request->rq_repbuf != NULL) + sptlrpc_cli_free_repbuf(request); + if (request->rq_export != NULL) { + class_export_put(request->rq_export); + request->rq_export = NULL; + } + if (request->rq_import != NULL) { + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + ptlrpc_free_bulk_pin(request->rq_bulk); + + if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL) + sptlrpc_cli_free_reqbuf(request); + + if (request->rq_cli_ctx) + sptlrpc_req_put_ctx(request, !locked); + + if (request->rq_pool) + __ptlrpc_free_req_to_pool(request); + else + OBD_FREE(request, sizeof(*request)); + EXIT; +} + +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked); +/** + * Drop one request reference. Must be called with import imp_lock held. + * When reference count drops to zero, reuqest is freed. + */ +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request) +{ + LASSERT(spin_is_locked(&request->rq_import->imp_lock)); + (void)__ptlrpc_req_finished(request, 1); +} +EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock); + +/** + * Helper function + * Drops one reference count for request \a request. + * \a locked set indicates that caller holds import imp_lock. + * Frees the request whe reference count reaches zero. + */ +static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) +{ + ENTRY; + if (request == NULL) + RETURN(1); + + if (request == LP_POISON || + request->rq_reqmsg == LP_POISON) { + CERROR("dereferencing freed request (bug 575)\n"); + LBUG(); + RETURN(1); + } + + DEBUG_REQ(D_INFO, request, "refcount now %u", + atomic_read(&request->rq_refcount) - 1); + + if (atomic_dec_and_test(&request->rq_refcount)) { + __ptlrpc_free_req(request, locked); + RETURN(1); + } + + RETURN(0); +} + +/** + * Drops one reference count for a request. + */ +void ptlrpc_req_finished(struct ptlrpc_request *request) +{ + __ptlrpc_req_finished(request, 0); +} +EXPORT_SYMBOL(ptlrpc_req_finished); + +/** + * Returns xid of a \a request + */ +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + +/** + * Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + * Returns 0 on success or 1 if unregistering cannot be made. + */ +int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) +{ + int rc; + wait_queue_head_t *wq; + struct l_wait_info lwi; + + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* + * Let's setup deadline for reply unlink. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0) + request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK; + + /* + * Nothing left to do. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + LNetMDUnlink(request->rq_reply_md_h); + + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + + /* + * Move to "Unregistering" phase as reply was not unlinked yet. + */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ + if (request->rq_set != NULL) + wq = &request->rq_set->set_waitq; + else + wq = &request->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " + "rvcng=%d unlnk=%d", request->rq_receiving_reply, + request->rq_must_unlink); + } + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_reply); + +/** + * Iterates through replay_list on import and prunes + * all requests have transno smaller than last_committed for the + * import and don't have rq_replay set. + * Since requests are sorted in transno order, stops when meetign first + * transno bigger than last_committed. + * caller must hold imp->imp_lock + */ +void ptlrpc_free_committed(struct obd_import *imp) +{ + struct list_head *tmp, *saved; + struct ptlrpc_request *req; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + ENTRY; + + LASSERT(imp != NULL); + + LASSERT(spin_is_locked(&imp->imp_lock)); + + + if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && + imp->imp_generation == imp->imp_last_generation_checked) { + CDEBUG(D_INFO, "%s: skip recheck: last_committed "LPU64"\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno); + EXIT; + return; + } + CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n", + imp->imp_obd->obd_name, imp->imp_peer_committed_transno, + imp->imp_generation); + imp->imp_last_transno_checked = imp->imp_peer_committed_transno; + imp->imp_last_generation_checked = imp->imp_generation; + + list_for_each_safe(tmp, saved, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT(req != last_req); + last_req = req; + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "zero transno during replay"); + LBUG(); + } + if (req->rq_import_generation < imp->imp_generation) { + DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); + GOTO(free_req, 0); + } + + if (req->rq_replay) { + DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); + continue; + } + + /* not yet committed */ + if (req->rq_transno > imp->imp_peer_committed_transno) { + DEBUG_REQ(D_RPCTRACE, req, "stopping search"); + break; + } + + DEBUG_REQ(D_INFO, req, "commit (last_committed "LPU64")", + imp->imp_peer_committed_transno); +free_req: + spin_lock(&req->rq_lock); + req->rq_replay = 0; + spin_unlock(&req->rq_lock); + if (req->rq_commit_cb != NULL) + req->rq_commit_cb(req); + list_del_init(&req->rq_replay_list); + __ptlrpc_req_finished(req, 1); + } + + EXIT; + return; +} + +void ptlrpc_cleanup_client(struct obd_import *imp) +{ + ENTRY; + EXIT; + return; +} +EXPORT_SYMBOL(ptlrpc_cleanup_client); + +/** + * Schedule previously sent request for resend. + * For bulk requests we assign new xid (to avoid problems with + * lost replies and therefore several transfers landing into same buffer + * from different sending attempts). + */ +void ptlrpc_resend_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "going to resend"); + lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); + req->rq_status = -EAGAIN; + + spin_lock(&req->rq_lock); + req->rq_resend = 1; + req->rq_net_err = 0; + req->rq_timedout = 0; + if (req->rq_bulk) { + __u64 old_xid = req->rq_xid; + + /* ensure previous bulk fails */ + req->rq_xid = ptlrpc_next_xid(); + CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n", + old_xid, req->rq_xid); + } + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_resend_req); + +/* XXX: this function and rq_status are currently unused */ +void ptlrpc_restart_req(struct ptlrpc_request *req) +{ + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); + req->rq_status = -ERESTARTSYS; + + spin_lock(&req->rq_lock); + req->rq_restart = 1; + req->rq_timedout = 0; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} +EXPORT_SYMBOL(ptlrpc_restart_req); + +/** + * Grab additional reference on a request \a req + */ +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} +EXPORT_SYMBOL(ptlrpc_request_addref); + +/** + * Add a request to import replay_list. + * Must be called under imp_lock + */ +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + + LASSERT(spin_is_locked(&imp->imp_lock)); + + if (req->rq_transno == 0) { + DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); + LBUG(); + } + + /* clear this for new requests that were resent as well + as resent replayed requests. */ + lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); + + /* don't re-add requests that have been replayed */ + if (!list_empty(&req->rq_replay_list)) + return; + + lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); + + LASSERT(imp->imp_replayable); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = + list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! + */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_replay_list, &iter->rq_replay_list); + return; + } + + list_add(&req->rq_replay_list, &imp->imp_replay_list); +} +EXPORT_SYMBOL(ptlrpc_retain_replayable_request); + +/** + * Send request and wait until it completes. + * Returns request processing status. + */ +int ptlrpc_queue_wait(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *set; + int rc; + ENTRY; + + LASSERT(req->rq_set == NULL); + LASSERT(!req->rq_receiving_reply); + + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("Unable to allocate ptlrpc set."); + RETURN(-ENOMEM); + } + + /* for distributed debugging */ + lustre_msg_set_status(req->rq_reqmsg, current_pid()); + + /* add a ref for the set (see comment in ptlrpc_set_add_req) */ + ptlrpc_request_addref(req); + ptlrpc_set_add_req(set, req); + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_queue_wait); + +struct ptlrpc_replay_async_args { + int praa_old_state; + int praa_old_status; +}; + +/** + * Callback used for replayed requests reply processing. + * In case of succesful reply calls registeresd request replay callback. + * In case of error restart replay process. + */ +static int ptlrpc_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + struct ptlrpc_replay_async_args *aa = data; + struct obd_import *imp = req->rq_import; + + ENTRY; + atomic_dec(&imp->imp_replay_inflight); + + if (!ptlrpc_client_replied(req)) { + CERROR("request replay timed out, restarting recovery\n"); + GOTO(out, rc = -ETIMEDOUT); + } + + if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && + (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || + lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) + GOTO(out, rc = lustre_msg_get_status(req->rq_repmsg)); + + /** VBR: check version failure */ + if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { + /** replay was failed due to version mismatch */ + DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 1; + imp->imp_no_lock_replay = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } else { + /** The transno had better not change over replay. */ + LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == + lustre_msg_get_transno(req->rq_repmsg) || + lustre_msg_get_transno(req->rq_repmsg) == 0, + LPX64"/"LPX64"\n", + lustre_msg_get_transno(req->rq_reqmsg), + lustre_msg_get_transno(req->rq_repmsg)); + } + + spin_lock(&imp->imp_lock); + /** if replays by version then gap occur on server, no trust to locks */ + if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) + imp->imp_no_lock_replay = 1; + imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); + spin_unlock(&imp->imp_lock); + LASSERT(imp->imp_last_replay_transno); + + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno "LPU64" is bigger than the " + "replayed one: "LPU64, req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + + DEBUG_REQ(D_HA, req, "got rep"); + + /* let the callback do fixups, possibly including in the request */ + if (req->rq_replay_cb) + req->rq_replay_cb(req); + + if (ptlrpc_client_replied(req) && + lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { + DEBUG_REQ(D_ERROR, req, "status %d, old was %d", + lustre_msg_get_status(req->rq_repmsg), + aa->praa_old_status); + } else { + /* Put it back for re-replay. */ + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); + } + + /* + * Errors while replay can set transno to 0, but + * imp_last_replay_transno shouldn't be set to 0 anyway + */ + if (req->rq_transno == 0) + CERROR("Transno is 0 during replay!\n"); + + /* continue with recovery */ + rc = ptlrpc_import_recovery_state_machine(imp); + out: + req->rq_send_state = aa->praa_old_state; + + if (rc != 0) + /* this replay failed, so restart recovery */ + ptlrpc_connect_import(imp); + + RETURN(rc); +} + +/** + * Prepares and queues request for replay. + * Adds it to ptlrpcd queue for actual sending. + * Returns 0 on success. + */ +int ptlrpc_replay_req(struct ptlrpc_request *req) +{ + struct ptlrpc_replay_async_args *aa; + ENTRY; + + LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); + + LASSERT (sizeof (*aa) <= sizeof (req->rq_async_args)); + aa = ptlrpc_req_async_args(req); + memset(aa, 0, sizeof *aa); + + /* Prepare request to be resent with ptlrpcd */ + aa->praa_old_state = req->rq_send_state; + req->rq_send_state = LUSTRE_IMP_REPLAY; + req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + req->rq_status = 0; + req->rq_interpret_reply = ptlrpc_replay_interpret; + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + + /* Tell server the net_latency, so the server can calculate how long + * it should wait for next replay */ + lustre_msg_set_service_time(req->rq_reqmsg, + ptlrpc_at_get_net_latency(req)); + DEBUG_REQ(D_HA, req, "REPLAY"); + + atomic_inc(&req->rq_import->imp_replay_inflight); + ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ + + ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1); + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_replay_req); + +/** + * Aborts all in-flight request on import \a imp sending and delayed lists + */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + ENTRY; + + /* Make sure that no new requests get processed for this import. + * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + spin_lock(&imp->imp_lock); + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? + */ + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "inflight"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); + + spin_lock(&req->rq_lock); + if (req->rq_import_generation < imp->imp_generation) { + req->rq_err = 1; + req->rq_status = -EIO; + ptlrpc_client_wake_req(req); + } + spin_unlock(&req->rq_lock); + } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. */ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + spin_unlock(&imp->imp_lock); + + EXIT; +} +EXPORT_SYMBOL(ptlrpc_abort_inflight); + +/** + * Abort all uncompleted requests in request set \a set + */ +void ptlrpc_abort_set(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + + LASSERT(set != NULL); + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + spin_lock(&req->rq_lock); + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + continue; + } + + req->rq_err = 1; + req->rq_status = -EINTR; + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + } +} + +static __u64 ptlrpc_last_xid; +static spinlock_t ptlrpc_last_xid_lock; + +/** + * Initialize the XID for the node. This is common among all requests on + * this node, and only requires the property that it is monotonically + * increasing. It does not need to be sequential. Since this is also used + * as the RDMA match bits, it is important that a single client NOT have + * the same match bits for two different in-flight requests, hence we do + * NOT want to have an XID per target or similar. + * + * To avoid an unlikely collision between match bits after a client reboot + * (which would deliver old data into the wrong RDMA buffer) initialize + * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. + * If the time is clearly incorrect, we instead use a 62-bit random number. + * In the worst case the random number will overflow 1M RPCs per second in + * 9133 years, or permutations thereof. + */ +#define YEAR_2004 (1ULL << 30) +void ptlrpc_init_xid(void) +{ + time_t now = cfs_time_current_sec(); + + spin_lock_init(&ptlrpc_last_xid_lock); + if (now < YEAR_2004) { + cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); + ptlrpc_last_xid >>= 2; + ptlrpc_last_xid |= (1ULL << 61); + } else { + ptlrpc_last_xid = (__u64)now << 20; + } + + /* Need to always be aligned to a power-of-two for mutli-bulk BRW */ + CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0); + ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; +} + +/** + * Increase xid and returns resulting new value to the caller. + * + * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting + * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC + * itself uses the last bulk xid needed, so the server can determine the + * the number of bulk transfers from the RPC XID and a bitmask. The starting + * xid must align to a power-of-two value. + * + * This is assumed to be true due to the initial ptlrpc_last_xid + * value also being initialized to a power-of-two value. LU-1431 + */ +__u64 ptlrpc_next_xid(void) +{ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + ptlrpc_last_xid = next; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +} +EXPORT_SYMBOL(ptlrpc_next_xid); + +/** + * Get a glimpse at what next xid value might have been. + * Returns possible next xid. + */ +__u64 ptlrpc_sample_next_xid(void) +{ +#if BITS_PER_LONG == 32 + /* need to avoid possible word tearing on 32-bit systems */ + __u64 next; + + spin_lock(&ptlrpc_last_xid_lock); + next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; + spin_unlock(&ptlrpc_last_xid_lock); + + return next; +#else + /* No need to lock, since returned value is racy anyways */ + return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; +#endif +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); + +/** + * Functions for operating ptlrpc workers. + * + * A ptlrpc work is a function which will be running inside ptlrpc context. + * The callback shouldn't sleep otherwise it will block that ptlrpcd thread. + * + * 1. after a work is created, it can be used many times, that is: + * handler = ptlrpcd_alloc_work(); + * ptlrpcd_queue_work(); + * + * queue it again when necessary: + * ptlrpcd_queue_work(); + * ptlrpcd_destroy_work(); + * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but + * it will only be queued once in any time. Also as its name implies, it may + * have delay before it really runs by ptlrpcd thread. + */ +struct ptlrpc_work_async_args { + __u64 magic; + int (*cb)(const struct lu_env *, void *); + void *cbdata; +}; + +#define PTLRPC_WORK_MAGIC 0x6655436b676f4f44ULL /* magic code */ + +static int work_interpreter(const struct lu_env *env, + struct ptlrpc_request *req, void *data, int rc) +{ + struct ptlrpc_work_async_args *arg = data; + + LASSERT(arg->magic == PTLRPC_WORK_MAGIC); + LASSERT(arg->cb != NULL); + + return arg->cb(env, arg->cbdata); +} + +/** + * Create a work for ptlrpc. + */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *cbdata) +{ + struct ptlrpc_request *req = NULL; + struct ptlrpc_work_async_args *args; + ENTRY; + + might_sleep(); + + if (cb == NULL) + RETURN(ERR_PTR(-EINVAL)); + + /* copy some code from deprecated fakereq. */ + OBD_ALLOC_PTR(req); + if (req == NULL) { + CERROR("ptlrpc: run out of memory!\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + + req->rq_send_state = LUSTRE_IMP_FULL; + req->rq_type = PTL_RPC_MSG_REQUEST; + req->rq_import = class_import_get(imp); + req->rq_export = NULL; + req->rq_interpret_reply = work_interpreter; + /* don't want reply */ + req->rq_receiving_reply = 0; + req->rq_must_unlink = 0; + req->rq_no_delay = req->rq_no_resend = 1; + + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_list); + INIT_LIST_HEAD(&req->rq_replay_list); + INIT_LIST_HEAD(&req->rq_set_chain); + INIT_LIST_HEAD(&req->rq_history_list); + INIT_LIST_HEAD(&req->rq_exp_list); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + atomic_set(&req->rq_refcount, 1); + + CLASSERT (sizeof(*args) <= sizeof(req->rq_async_args)); + args = ptlrpc_req_async_args(req); + args->magic = PTLRPC_WORK_MAGIC; + args->cb = cb; + args->cbdata = cbdata; + + RETURN(req); +} +EXPORT_SYMBOL(ptlrpcd_alloc_work); + +void ptlrpcd_destroy_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + if (req) + ptlrpc_req_finished(req); +} +EXPORT_SYMBOL(ptlrpcd_destroy_work); + +int ptlrpcd_queue_work(void *handler) +{ + struct ptlrpc_request *req = handler; + + /* + * Check if the req is already being queued. + * + * Here comes a trick: it lacks a way of checking if a req is being + * processed reliably in ptlrpc. Here I have to use refcount of req + * for this purpose. This is okay because the caller should use this + * req as opaque data. - Jinshan + */ + LASSERT(atomic_read(&req->rq_refcount) > 0); + if (atomic_read(&req->rq_refcount) > 1) + return -EBUSY; + + if (atomic_inc_return(&req->rq_refcount) > 2) { /* race */ + atomic_dec(&req->rq_refcount); + return -EBUSY; + } + + /* re-initialize the req */ + req->rq_timeout = obd_timeout; + req->rq_sent = cfs_time_current_sec(); + req->rq_deadline = req->rq_sent + req->rq_timeout; + req->rq_reply_deadline = req->rq_deadline; + req->rq_phase = RQ_PHASE_INTERPRET; + req->rq_next_phase = RQ_PHASE_COMPLETE; + req->rq_xid = ptlrpc_next_xid(); + req->rq_import_generation = req->rq_import->imp_generation; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + return 0; +} +EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c new file mode 100644 index 000000000000..a0757f372be5 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/connection.c @@ -0,0 +1,248 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include + +#include "ptlrpc_internal.h" + +static cfs_hash_t *conn_hash = NULL; +static cfs_hash_ops_t conn_hash_ops; + +struct ptlrpc_connection * +ptlrpc_connection_get(lnet_process_id_t peer, lnet_nid_t self, + struct obd_uuid *uuid) +{ + struct ptlrpc_connection *conn, *conn2; + ENTRY; + + conn = cfs_hash_lookup(conn_hash, &peer); + if (conn) + GOTO(out, conn); + + OBD_ALLOC_PTR(conn); + if (!conn) + RETURN(NULL); + + conn->c_peer = peer; + conn->c_self = self; + INIT_HLIST_NODE(&conn->c_hash); + atomic_set(&conn->c_refcount, 1); + if (uuid) + obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); + + /* + * Add the newly created conn to the hash, on key collision we + * lost a racing addition and must destroy our newly allocated + * connection. The object which exists in the has will be + * returned and may be compared against out object. + */ + /* In the function below, .hs_keycmp resolves to + * conn_keycmp() */ + /* coverity[overrun-buffer-val] */ + conn2 = cfs_hash_findadd_unique(conn_hash, &peer, &conn->c_hash); + if (conn != conn2) { + OBD_FREE_PTR(conn); + conn = conn2; + } + EXIT; +out: + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + return conn; +} +EXPORT_SYMBOL(ptlrpc_connection_get); + +int ptlrpc_connection_put(struct ptlrpc_connection *conn) +{ + int rc = 0; + ENTRY; + + if (!conn) + RETURN(rc); + + LASSERT(atomic_read(&conn->c_refcount) > 1); + + /* + * We do not remove connection from hashtable and + * do not free it even if last caller released ref, + * as we want to have it cached for the case it is + * needed again. + * + * Deallocating it and later creating new connection + * again would be wastful. This way we also avoid + * expensive locking to protect things from get/put + * race when found cached connection is freed by + * ptlrpc_connection_put(). + * + * It will be freed later in module unload time, + * when ptlrpc_connection_fini()->lh_exit->conn_exit() + * path is called. + */ + if (atomic_dec_return(&conn->c_refcount) == 1) + rc = 1; + + CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connection_put); + +struct ptlrpc_connection * +ptlrpc_connection_addref(struct ptlrpc_connection *conn) +{ + ENTRY; + + atomic_inc(&conn->c_refcount); + CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", + conn, atomic_read(&conn->c_refcount), + libcfs_nid2str(conn->c_peer.nid)); + + RETURN(conn); +} +EXPORT_SYMBOL(ptlrpc_connection_addref); + +int ptlrpc_connection_init(void) +{ + ENTRY; + + conn_hash = cfs_hash_create("CONN_HASH", + HASH_CONN_CUR_BITS, + HASH_CONN_MAX_BITS, + HASH_CONN_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &conn_hash_ops, CFS_HASH_DEFAULT); + if (!conn_hash) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_connection_init); + +void ptlrpc_connection_fini(void) { + ENTRY; + cfs_hash_putref(conn_hash); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_connection_fini); + +/* + * Hash operations for net_peer<->connection + */ +static unsigned +conn_hashfn(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_process_id_t), mask); +} + +static int +conn_keycmp(const void *key, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + const lnet_process_id_t *conn_key; + + LASSERT(key != NULL); + conn_key = (lnet_process_id_t*)key; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + + return conn_key->nid == conn->c_peer.nid && + conn_key->pid == conn->c_peer.pid; +} + +static void * +conn_key(struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + return &conn->c_peer; +} + +static void * +conn_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct ptlrpc_connection, c_hash); +} + +static void +conn_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_inc(&conn->c_refcount); +} + +static void +conn_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + atomic_dec(&conn->c_refcount); +} + +static void +conn_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct ptlrpc_connection *conn; + + conn = hlist_entry(hnode, struct ptlrpc_connection, c_hash); + /* + * Nothing should be left. Connection user put it and + * connection also was deleted from table by this time + * so we should have 0 refs. + */ + LASSERTF(atomic_read(&conn->c_refcount) == 0, + "Busy connection with %d refs\n", + atomic_read(&conn->c_refcount)); + OBD_FREE_PTR(conn); +} + +static cfs_hash_ops_t conn_hash_ops = { + .hs_hash = conn_hashfn, + .hs_keycmp = conn_keycmp, + .hs_key = conn_key, + .hs_object = conn_object, + .hs_get = conn_get, + .hs_put_locked = conn_put_locked, + .hs_exit = conn_exit, +}; diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c new file mode 100644 index 000000000000..0264c102cb3e --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/events.c @@ -0,0 +1,595 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +# include +# ifdef __mips64__ +# include +# endif + +#include +#include +#include +#include "ptlrpc_internal.h" + +lnet_handle_eq_t ptlrpc_eq_h; + +/* + * Client's outgoing request callback + */ +void request_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->unlinked); + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + sptlrpc_request_out_callback(req); + req->rq_real_sent = cfs_time_current_sec(); + + if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { + + /* Failed send: make it seem like the reply timed out, just + * like failing sends in client.c does currently... */ + + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + + ptlrpc_client_wake_req(req); + } + + ptlrpc_req_finished(req); + + EXIT; +} + +/* + * Client's incoming reply callback + */ +void reply_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request *req = cbid->cbid_arg; + ENTRY; + + DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); + + LASSERT (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->md.start == req->rq_repbuf); + LASSERT (ev->offset + ev->mlength <= req->rq_repbuf_len); + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); + + spin_lock(&req->rq_lock); + + req->rq_receiving_reply = 0; + req->rq_early = 0; + if (ev->unlinked) + req->rq_must_unlink = 0; + + if (ev->status) + goto out_wake; + + if (ev->type == LNET_EVENT_UNLINK) { + LASSERT(ev->unlinked); + DEBUG_REQ(D_NET, req, "unlink"); + goto out_wake; + } + + if (ev->mlength < ev->rlength ) { + CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, + req->rq_replen, ev->rlength, ev->offset); + req->rq_reply_truncate = 1; + req->rq_replied = 1; + req->rq_status = -EOVERFLOW; + req->rq_nob_received = ev->rlength + ev->offset; + goto out_wake; + } + + if ((ev->offset == 0) && + ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d " + "replied=%d unlinked=%d", ev->mlength, ev->offset, + req->rq_replen, req->rq_replied, ev->unlinked); + + req->rq_early_count++; /* number received, client side */ + + if (req->rq_replied) /* already got the real reply */ + goto out_wake; + + req->rq_early = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ + req->rq_rep_swab_mask = 0; + req->rq_replied = 1; + req->rq_reply_off = ev->offset; + req->rq_nob_received = ev->mlength; + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); + } + + req->rq_import->imp_last_reply_time = cfs_time_current_sec(); + +out_wake: + /* NB don't unlock till after wakeup; req can disappear under us + * since we don't have our own ref */ + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); + EXIT; +} + +/* + * Client's bulk has been written/read + */ +void client_bulk_callback (lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; + struct ptlrpc_request *req; + ENTRY; + + LASSERT ((desc->bd_type == BULK_PUT_SINK && + ev->type == LNET_EVENT_PUT) || + (desc->bd_type == BULK_GET_SOURCE && + ev->type == LNET_EVENT_GET) || + ev->type == LNET_EVENT_UNLINK); + LASSERT (ev->unlinked); + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) + ev->status = -EIO; + + if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE)) + ev->status = -EIO; + + CDEBUG((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, desc %p\n", + ev->type, ev->status, desc); + + spin_lock(&desc->bd_lock); + req = desc->bd_req; + LASSERT(desc->bd_md_count > 0); + desc->bd_md_count--; + + if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { + desc->bd_nob_transferred += ev->mlength; + desc->bd_sender = ev->sender; + } else { + /* start reconnect and resend if network error hit */ + spin_lock(&req->rq_lock); + req->rq_net_err = 1; + spin_unlock(&req->rq_lock); + } + + if (ev->status != 0) + desc->bd_failure = 1; + + /* NB don't unlock till after wakeup; desc can disappear under us + * otherwise */ + if (desc->bd_md_count == 0) + ptlrpc_client_wake_req(desc->bd_req); + + spin_unlock(&desc->bd_lock); + EXIT; +} + +/* + * We will have percpt request history list for ptlrpc service in upcoming + * patches because we don't want to be serialized by current per-service + * history operations. So we require history ID can (somehow) show arriving + * order w/o grabbing global lock, and user can sort them in userspace. + * + * This is how we generate history ID for ptlrpc_request: + * ---------------------------------------------------- + * | 32 bits | 16 bits | (16 - X)bits | X bits | + * ---------------------------------------------------- + * | seconds | usec / 16 | sequence | CPT id | + * ---------------------------------------------------- + * + * it might not be precise but should be good enough. + */ + +#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) + +#define REQS_SEC_SHIFT 32 +#define REQS_USEC_SHIFT 16 +#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) + +static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + __u64 sec = req->rq_arrival_time.tv_sec; + __u32 usec = req->rq_arrival_time.tv_usec >> 4; /* usec / 16 */ + __u64 new_seq; + + /* set sequence ID for request and add it to history list, + * it must be called with hold svcpt::scp_lock */ + + new_seq = (sec << REQS_SEC_SHIFT) | + (usec << REQS_USEC_SHIFT) | + (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); + + if (new_seq > svcpt->scp_hist_seq) { + /* This handles the initial case of scp_hist_seq == 0 or + * we just jumped into a new time window */ + svcpt->scp_hist_seq = new_seq; + } else { + LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); + /* NB: increase sequence number in current usec bucket, + * however, it's possible that we used up all bits for + * sequence and jumped into the next usec bucket (future time), + * then we hope there will be less RPCs per bucket at some + * point, and sequence will catch up again */ + svcpt->scp_hist_seq += (1U << REQS_SEQ_SHIFT(svcpt)); + new_seq = svcpt->scp_hist_seq; + } + + req->rq_history_seq = new_seq; + + list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); +} + +/* + * Server's incoming request callback + */ +void request_in_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *service = svcpt->scp_service; + struct ptlrpc_request *req; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_PUT || + ev->type == LNET_EVENT_UNLINK); + LASSERT ((char *)ev->md.start >= rqbd->rqbd_buffer); + LASSERT ((char *)ev->md.start + ev->offset + ev->mlength <= + rqbd->rqbd_buffer + service->srv_buf_size); + + CDEBUG((ev->status == 0) ? D_NET : D_ERROR, + "event type %d, status %d, service %s\n", + ev->type, ev->status, service->srv_name); + + if (ev->unlinked) { + /* If this is the last request message to fit in the + * request buffer we can use the request object embedded in + * rqbd. Note that if we failed to allocate a request, + * we'd have to re-post the rqbd, which we can't do in this + * context. */ + req = &rqbd->rqbd_req; + memset(req, 0, sizeof (*req)); + } else { + LASSERT (ev->type == LNET_EVENT_PUT); + if (ev->status != 0) { + /* We moaned above already... */ + return; + } + OBD_ALLOC_GFP(req, sizeof(*req), ALLOC_ATOMIC_TRY); + if (req == NULL) { + CERROR("Can't allocate incoming request descriptor: " + "Dropping %s RPC from %s\n", + service->srv_name, + libcfs_id2str(ev->initiator)); + return; + } + } + + /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, + * flags are reset and scalars are zero. We only set the message + * size to non-zero if this was a successful receive. */ + req->rq_xid = ev->match_bits; + req->rq_reqbuf = ev->md.start + ev->offset; + if (ev->type == LNET_EVENT_PUT && ev->status == 0) + req->rq_reqdata_len = ev->mlength; + do_gettimeofday(&req->rq_arrival_time); + req->rq_peer = ev->initiator; + req->rq_self = ev->target.nid; + req->rq_rqbd = rqbd; + req->rq_phase = RQ_PHASE_NEW; + spin_lock_init(&req->rq_lock); + INIT_LIST_HEAD(&req->rq_timed_list); + INIT_LIST_HEAD(&req->rq_exp_list); + atomic_set(&req->rq_refcount, 1); + if (ev->type == LNET_EVENT_PUT) + CDEBUG(D_INFO, "incoming req@%p x"LPU64" msgsize %u\n", + req, req->rq_xid, ev->mlength); + + CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); + + spin_lock(&svcpt->scp_lock); + + ptlrpc_req_add_history(svcpt, req); + + if (ev->unlinked) { + svcpt->scp_nrqbds_posted--; + CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", + svcpt->scp_nrqbds_posted); + + /* Normally, don't complain about 0 buffers posted; LNET won't + * drop incoming reqs since we set the portal lazy */ + if (test_req_buffer_pressure && + ev->type != LNET_EVENT_UNLINK && + svcpt->scp_nrqbds_posted == 0) + CWARN("All %s request buffers busy\n", + service->srv_name); + + /* req takes over the network's ref on rqbd */ + } else { + /* req takes a ref on rqbd */ + rqbd->rqbd_refcount++; + } + + list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); + svcpt->scp_nreqs_incoming++; + + /* NB everything can disappear under us once the request + * has been queued and we unlock, so do the wake now... */ + wake_up(&svcpt->scp_waitq); + + spin_unlock(&svcpt->scp_lock); + EXIT; +} + +/* + * Server's outgoing reply callback + */ +void reply_out_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + struct ptlrpc_reply_state *rs = cbid->cbid_arg; + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + ENTRY; + + LASSERT (ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); + + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' */ + LASSERT (ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } + + LASSERT (rs->rs_on_net); + + if (ev->unlinked) { + /* Last network callback. The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it */ + spin_lock(&svcpt->scp_rep_lock); + spin_lock(&rs->rs_lock); + + rs->rs_on_net = 0; + if (!rs->rs_no_ack || + rs->rs_transno <= + rs->rs_export->exp_obd->obd_last_committed) + ptlrpc_schedule_difficult_reply(rs); + + spin_unlock(&rs->rs_lock); + spin_unlock(&svcpt->scp_rep_lock); + } + EXIT; +} + + +static void ptlrpc_master_callback(lnet_event_t *ev) +{ + struct ptlrpc_cb_id *cbid = ev->md.user_ptr; + void (*callback)(lnet_event_t *ev) = cbid->cbid_fn; + + /* Honestly, it's best to find out early. */ + LASSERT (cbid->cbid_arg != LP_POISON); + LASSERT (callback == request_out_callback || + callback == reply_in_callback || + callback == client_bulk_callback || + callback == request_in_callback || + callback == reply_out_callback + ); + + callback (ev); +} + +int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self) +{ + int best_dist = 0; + __u32 best_order = 0; + int count = 0; + int rc = -ENOENT; + int portals_compatibility; + int dist; + __u32 order; + lnet_nid_t dst_nid; + lnet_nid_t src_nid; + + portals_compatibility = LNetCtl(IOC_LIBCFS_PORTALS_COMPATIBILITY, NULL); + + peer->pid = LUSTRE_SRV_LNET_PID; + + /* Choose the matching UUID that's closest */ + while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { + dist = LNetDist(dst_nid, &src_nid, &order); + if (dist < 0) + continue; + + if (dist == 0) { /* local! use loopback LND */ + peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); + rc = 0; + break; + } + + if (rc < 0 || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + + if (portals_compatibility > 1) { + /* Strong portals compatibility: Zero the nid's + * NET, so if I'm reading new config logs, or + * getting configured by (new) lconf I can + * still talk to old servers. */ + dst_nid = LNET_MKNID(0, LNET_NIDADDR(dst_nid)); + src_nid = LNET_MKNID(0, LNET_NIDADDR(src_nid)); + } + peer->nid = dst_nid; + *self = src_nid; + rc = 0; + } + } + + CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); + return rc; +} + +void ptlrpc_ni_fini(void) +{ + wait_queue_head_t waitq; + struct l_wait_info lwi; + int rc; + int retries; + + /* Wait for the event queue to become idle since there may still be + * messages in flight with pending events (i.e. the fire-and-forget + * messages == client requests and "non-difficult" server + * replies */ + + for (retries = 0;; retries++) { + rc = LNetEQFree(ptlrpc_eq_h); + switch (rc) { + default: + LBUG(); + + case 0: + LNetNIFini(); + return; + + case -EBUSY: + if (retries != 0) + CWARN("Event queue still busy\n"); + + /* Wait for a bit */ + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(2), NULL, NULL); + l_wait_event(waitq, 0, &lwi); + break; + } + } + /* notreached */ +} + +lnet_pid_t ptl_get_pid(void) +{ + lnet_pid_t pid; + + pid = LUSTRE_SRV_LNET_PID; + return pid; +} + +int ptlrpc_ni_init(void) +{ + int rc; + lnet_pid_t pid; + + pid = ptl_get_pid(); + CDEBUG(D_NET, "My pid is: %x\n", pid); + + /* We're not passing any limits yet... */ + rc = LNetNIInit(pid); + if (rc < 0) { + CDEBUG (D_NET, "Can't init network interface: %d\n", rc); + return (-ENOENT); + } + + /* CAVEAT EMPTOR: how we process portals events is _radically_ + * different depending on... */ + /* kernel LNet calls our master callback when there are new event, + * because we are guaranteed to get every event via callback, + * so we just set EQ size to 0 to avoid overhread of serializing + * enqueue/dequeue operations in LNet. */ + rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); + if (rc == 0) + return 0; + + CERROR ("Failed to allocate event queue: %d\n", rc); + LNetNIFini(); + + return (-ENOMEM); +} + + +int ptlrpc_init_portals(void) +{ + int rc = ptlrpc_ni_init(); + + if (rc != 0) { + CERROR("network initialisation failed\n"); + return -EIO; + } + rc = ptlrpcd_addref(); + if (rc == 0) + return 0; + + CERROR("rpcd initialisation failed\n"); + ptlrpc_ni_fini(); + return rc; +} + +void ptlrpc_exit_portals(void) +{ + ptlrpcd_decref(); + ptlrpc_ni_fini(); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile new file mode 100644 index 000000000000..8cdfbeed64e6 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/Makefile @@ -0,0 +1,8 @@ +obj-$(CONFIG_LUSTRE_FS) := ptlrpc_gss.o + +ptlrpc_gss-y := sec_gss.o gss_bulk.o gss_cli_upcall.o gss_svc_upcall.o \ + gss_rawobj.o lproc_gss.o gss_generic_token.o \ + gss_mech_switch.o gss_krb5_mech.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h new file mode 100644 index 000000000000..feac60482c97 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_api.h @@ -0,0 +1,179 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Somewhat simplified version of the gss api. + * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + * + */ + +#ifndef __PTLRPC_GSS_GSS_API_H_ +#define __PTLRPC_GSS_GSS_API_H_ + +struct gss_api_mech; + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; +}; + +#define GSS_C_NO_BUFFER ((rawobj_t) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_NULL_OID ((rawobj_t) 0) + +/* + * gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. + */ +__u32 lgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx); +__u32 lgss_copy_reverse_context( + struct gss_ctx *ctx, + struct gss_ctx **ctx_new); +__u32 lgss_inquire_context( + struct gss_ctx *ctx, + unsigned long *endtime); +__u32 lgss_get_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_verify_mic( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); +__u32 lgss_wrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); +__u32 lgss_unwrap( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_delete_sec_context( + struct gss_ctx **ctx); +int lgss_display( + struct gss_ctx *ctx, + char *buf, + int bufsize); + +struct subflavor_desc { + __u32 sf_subflavor; + __u32 sf_qop; + __u32 sf_service; + char *sf_name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + module_t *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx); + __u32 (*gss_copy_reverse_context)( + struct gss_ctx *ctx, + struct gss_ctx *ctx_new); + __u32 (*gss_inquire_context)( + struct gss_ctx *ctx, + unsigned long *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + void (*gss_delete_sec_context)( + void *ctx); + int (*gss_display)( + struct gss_ctx *ctx, + char *buf, + int bufsize); +}; + +int lgss_mech_register(struct gss_api_mech *mech); +void lgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * lgss_OID_to_mech(rawobj_t *oid); +struct gss_api_mech * lgss_name_to_mech(char *name); +struct gss_api_mech * lgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * lgss_mech_get(struct gss_api_mech *mech); +void lgss_mech_put(struct gss_api_mech *mech); + +#endif /* __PTLRPC_GSS_GSS_API_H_ */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h new file mode 100644 index 000000000000..c70eb00796f9 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_asn1.h @@ -0,0 +1,84 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header(rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, + rawobj_t *in_buf); + +int g_token_size(rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header(rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c new file mode 100644 index 000000000000..ed95bbba95ca --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_bulk.c @@ -0,0 +1,512 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *msg; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + LASSERT(req->rq_reqbuf->lm_bufcount >= 3); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 1; + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(req->rq_reqbuf->lm_bufcount >= 4); + msg = req->rq_reqbuf; + offset = msg->lm_bufcount - 2; + break; + case SPTLRPC_SVC_PRIV: + LASSERT(req->rq_clrbuf->lm_bufcount >= 2); + msg = req->rq_clrbuf; + offset = msg->lm_bufcount - 1; + break; + default: + LBUG(); + } + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + if (req->rq_bulk_read) { + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. + */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } + + RETURN(0); +} + +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_cli_ctx *gctx; + struct lustre_msg *rmsg, *vmsg; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int roff, voff; + ENTRY; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + vmsg = req->rq_repdata; + voff = vmsg->lm_bufcount - 1; + LASSERT(vmsg && vmsg->lm_bufcount >= 3); + + rmsg = req->rq_reqbuf; + roff = rmsg->lm_bufcount - 1; /* last segment */ + LASSERT(rmsg && rmsg->lm_bufcount >= 3); + break; + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + vmsg = req->rq_repdata; + voff = vmsg->lm_bufcount - 2; + LASSERT(vmsg && vmsg->lm_bufcount >= 4); + + rmsg = req->rq_reqbuf; + roff = rmsg->lm_bufcount - 2; /* second last segment */ + LASSERT(rmsg && rmsg->lm_bufcount >= 4); + break; + case SPTLRPC_SVC_PRIV: + vmsg = req->rq_repdata; + voff = vmsg->lm_bufcount - 1; + LASSERT(vmsg && vmsg->lm_bufcount >= 2); + + rmsg = req->rq_clrbuf; + roff = rmsg->lm_bufcount - 1; /* last segment */ + LASSERT(rmsg && rmsg->lm_bufcount >= 2); + break; + default: + LBUG(); + } + + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); + } + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. + */ + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > + desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, + &token, 1); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; + } + } + + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); + } + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} + +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c new file mode 100644 index 000000000000..142c789b1bc6 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -0,0 +1,447 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_cli_upcall.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +/********************************************** + * gss context init/fini helper * + **********************************************/ + +static +int ctx_init_pack_request(struct obd_import *imp, + struct ptlrpc_request *req, + int lustre_srv, + uid_t uid, gid_t gid, + long token_size, + char __user *token) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct gss_sec *gsec; + struct gss_header *ghdr; + struct ptlrpc_user_desc *pud; + __u32 *p, size, offset = 2; + rawobj_t obj; + + LASSERT(msg->lm_bufcount <= 4); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + + /* gss hdr */ + ghdr = lustre_msg_buf(msg, 0, sizeof(*ghdr)); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) imp->imp_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_INIT; + ghdr->gh_seq = 0; + ghdr->gh_svc = SPTLRPC_SVC_NULL; + ghdr->gh_handle.len = 0; + + /* fix the user desc */ + if (req->rq_pack_udesc) { + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = uid; + pud->pud_gid = pud->pud_fsgid = gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + offset++; + } + + /* security payload */ + p = lustre_msg_buf(msg, offset, 0); + size = msg->lm_buflens[offset]; + LASSERT(p); + + /* 1. lustre svc type */ + LASSERT(size > 4); + *p++ = cpu_to_le32(lustre_srv); + size -= 4; + + /* 2. target uuid */ + obj.len = strlen(imp->imp_obd->u.cli.cl_target_uuid.uuid) + 1; + obj.data = imp->imp_obd->u.cli.cl_target_uuid.uuid; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 3. reverse context handle. actually only needed by root user, + * but we send it anyway. */ + gsec = sec2gsec(req->rq_cli_ctx->cc_sec); + obj.len = sizeof(gsec->gs_rvs_hdl); + obj.data = (__u8 *) &gsec->gs_rvs_hdl; + if (rawobj_serialize(&obj, &p, &size)) + LBUG(); + + /* 4. now the token */ + LASSERT(size >= (sizeof(__u32) + token_size)); + *p++ = cpu_to_le32(((__u32) token_size)); + if (copy_from_user(p, token, token_size)) { + CERROR("can't copy token\n"); + return -EFAULT; + } + size -= sizeof(__u32) + cfs_size_round4(token_size); + + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, offset, + msg->lm_buflens[offset] - size, 0); + return 0; +} + +static +int ctx_init_parse_reply(struct lustre_msg *msg, int swabbed, + char __user *outbuf, long outlen) +{ + struct gss_rep_header *ghdr; + __u32 obj_len, round_len; + __u32 status, effective = 0; + + if (msg->lm_bufcount != 3) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + return -EPROTO; + } + + ghdr = (struct gss_rep_header *) gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("unable to extract gss reply header\n"); + return -EPROTO; + } + + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("invalid gss version %u\n", ghdr->gh_version); + return -EPROTO; + } + + if (outlen < (4 + 2) * 4 + cfs_size_round4(ghdr->gh_handle.len) + + cfs_size_round4(msg->lm_buflens[2])) { + CERROR("output buffer size %ld too small\n", outlen); + return -EFAULT; + } + + status = 0; + effective = 0; + + if (copy_to_user(outbuf, &status, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_major, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_minor, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, &ghdr->gh_seqwin, 4)) + return -EFAULT; + outbuf += 4; + effective += 4 * 4; + + /* handle */ + obj_len = ghdr->gh_handle.len; + round_len = (obj_len + 3) & ~ 3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, (char *) ghdr->gh_handle.data, round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + /* out token */ + obj_len = msg->lm_buflens[2]; + round_len = (obj_len + 3) & ~ 3; + if (copy_to_user(outbuf, &obj_len, 4)) + return -EFAULT; + outbuf += 4; + if (copy_to_user(outbuf, lustre_msg_buf(msg, 2, 0), round_len)) + return -EFAULT; + outbuf += round_len; + effective += 4 + round_len; + + return effective; +} + +/* XXX move to where lgssd could see */ +struct lgssd_ioctl_param { + int version; /* in */ + int secid; /* in */ + char *uuid; /* in */ + int lustre_svc; /* in */ + uid_t uid; /* in */ + gid_t gid; /* in */ + long send_token_size;/* in */ + char *send_token; /* in */ + long reply_buf_size; /* in */ + char *reply_buf; /* in */ + long status; /* out */ + long reply_length; /* out */ +}; + +int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count) +{ + struct obd_import *imp; + struct ptlrpc_request *req; + struct lgssd_ioctl_param param; + struct obd_device *obd; + char obdname[64]; + long lsize; + int rc; + + if (count != sizeof(param)) { + CERROR("ioctl size %lu, expect %lu, please check lgss_keyring " + "version\n", count, (unsigned long) sizeof(param)); + RETURN(-EINVAL); + } + if (copy_from_user(¶m, buffer, sizeof(param))) { + CERROR("failed copy data from lgssd\n"); + RETURN(-EFAULT); + } + + if (param.version != GSSD_INTERFACE_VERSION) { + CERROR("gssd interface version %d (expect %d)\n", + param.version, GSSD_INTERFACE_VERSION); + RETURN(-EINVAL); + } + + /* take name */ + if (strncpy_from_user(obdname, param.uuid, sizeof(obdname)) <= 0) { + CERROR("Invalid obdname pointer\n"); + RETURN(-EFAULT); + } + + obd = class_name2obd(obdname); + if (!obd) { + CERROR("no such obd %s\n", obdname); + RETURN(-EINVAL); + } + + if (unlikely(!obd->obd_set_up)) { + CERROR("obd %s not setup\n", obdname); + RETURN(-EINVAL); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + CERROR("obd %s has stopped\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("obd %s is not a client device\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + spin_unlock(&obd->obd_dev_lock); + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + CERROR("obd %s: import has gone\n", obd->obd_name); + up_read(&obd->u.cli.cl_sem); + RETURN(-EINVAL); + } + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + + if (imp->imp_deactive) { + CERROR("import has been deactivated\n"); + class_import_put(imp); + RETURN(-EINVAL); + } + + req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION, + SEC_CTX_INIT); + if (req == NULL) { + param.status = -ENOMEM; + goto out_copy; + } + + if (req->rq_cli_ctx->cc_sec->ps_id != param.secid) { + CWARN("original secid %d, now has changed to %d, " + "cancel this negotiation\n", param.secid, + req->rq_cli_ctx->cc_sec->ps_id); + param.status = -EINVAL; + goto out_copy; + } + + /* get token */ + rc = ctx_init_pack_request(imp, req, + param.lustre_svc, + param.uid, param.gid, + param.send_token_size, + param.send_token); + if (rc) { + param.status = rc; + goto out_copy; + } + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) { + /* If any _real_ denial be made, we expect server return + * -EACCES reply or return success but indicate gss error + * inside reply messsage. All other errors are treated as + * timeout, caller might try the negotiation repeatedly, + * leave recovery decisions to general ptlrpc layer. + * + * FIXME maybe some other error code shouldn't be treated + * as timeout. */ + param.status = rc; + if (rc != -EACCES) + param.status = -ETIMEDOUT; + goto out_copy; + } + + LASSERT(req->rq_repdata); + lsize = ctx_init_parse_reply(req->rq_repdata, + ptlrpc_rep_need_swab(req), + param.reply_buf, param.reply_buf_size); + if (lsize < 0) { + param.status = (int) lsize; + goto out_copy; + } + + param.status = 0; + param.reply_length = lsize; + +out_copy: + if (copy_to_user(buffer, ¶m, sizeof(param))) + rc = -EFAULT; + else + rc = 0; + + class_import_put(imp); + ptlrpc_req_finished(req); + RETURN(rc); +} + +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + struct obd_import *imp = ctx->cc_sec->ps_import; + struct ptlrpc_request *req; + struct ptlrpc_user_desc *pud; + int rc; + ENTRY; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (cli_ctx_is_error(ctx) || !cli_ctx_is_uptodate(ctx)) { + CDEBUG(D_SEC, "ctx %p(%u->%s) not uptodate, " + "don't send destroy rpc\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + RETURN(0); + } + + might_sleep(); + + CWARN("%s ctx %p idx "LPX64" (%u->%s)\n", + sec_is_reverse(ctx->cc_sec) ? + "server finishing reverse" : "client finishing forward", + ctx, gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + gctx->gc_proc = PTLRPC_GSS_PROC_DESTROY; + + req = ptlrpc_request_alloc(imp, &RQF_SEC_CTX); + if (req == NULL) { + CWARN("ctx %p(%u): fail to prepare rpc, destroy locally\n", + ctx, ctx->cc_vcred.vc_uid); + GOTO(out, rc = -ENOMEM); + } + + rc = ptlrpc_request_bufs_pack(req, LUSTRE_OBD_VERSION, SEC_CTX_FINI, + NULL, ctx); + if (rc) { + ptlrpc_request_free(req); + GOTO(out_ref, rc); + } + + /* fix the user desc */ + if (req->rq_pack_udesc) { + /* we rely the fact that this request is in AUTH mode, + * and user_desc at offset 2. */ + pud = lustre_msg_buf(req->rq_reqbuf, 2, sizeof(*pud)); + LASSERT(pud); + pud->pud_uid = pud->pud_fsuid = ctx->cc_vcred.vc_uid; + pud->pud_gid = pud->pud_fsgid = ctx->cc_vcred.vc_gid; + pud->pud_cap = 0; + pud->pud_ngroups = 0; + } + + req->rq_phase = RQ_PHASE_RPC; + rc = ptl_send_rpc(req, 1); + if (rc) + CWARN("ctx %p(%u->%s): rpc error %d, destroy locally\n", ctx, + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), rc); + +out_ref: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int __init gss_init_cli_upcall(void) +{ + return 0; +} + +void __exit gss_exit_cli_upcall(void) +{ +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h new file mode 100644 index 000000000000..13425796fa33 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_err.h @@ -0,0 +1,193 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __PTLRPC_GSS_GSS_ERR_H_ +#define __PTLRPC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG (1) +#define GSS_C_MUTUAL_FLAG (2) +#define GSS_C_REPLAY_FLAG (4) +#define GSS_C_SEQUENCE_FLAG (8) +#define GSS_C_CONF_FLAG (16) +#define GSS_C_INTEG_FLAG (32) +#define GSS_C_ANON_FLAG (64) +#define GSS_C_PROT_READY_FLAG (128) +#define GSS_C_TRANS_FLAG (256) + +/* + * Credential usage options + */ +#define GSS_C_BOTH (0) +#define GSS_C_INITIATE (1) +#define GSS_C_ACCEPT (2) + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE (1) +#define GSS_C_MECH_CODE (2) + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. + */ +#define GSS_C_QOP_DEFAULT (0) + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE (0) + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET (24) +#define GSS_C_ROUTINE_ERROR_OFFSET (16) +#define GSS_C_SUPPLEMENTARY_OFFSET (0) +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. + */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH \ + (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME \ + (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE \ + (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS \ + (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS \ + (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG \ + (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED \ + (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT \ + (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN \ + (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE \ + (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP \ + (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED \ + (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE \ + (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +/* XXXX these are not part of the GSSAPI C bindings! (but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __PTLRPC_GSS_GSS_ERR_H_ */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c new file mode 100644 index 000000000000..20b1638e7255 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_generic_token.c @@ -0,0 +1,285 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. */ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static +int der_length_size(int length) +{ + if (length < (1 << 7)) + return 1; + else if (length < (1 << 8)) + return 2; +#if (SIZEOF_INT == 2) + else + return 3; +#else + else if (length < (1 << 16)) + return 3; + else if (length < (1 << 24)) + return 4; + else + return 5; +#endif +} + +static +void der_write_length(unsigned char **buf, int length) +{ + if (length < (1 << 7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length) + 127); +#if (SIZEOF_INT > 2) + if (length >= (1 << 24)) + *(*buf)++ = (unsigned char) (length >> 24); + if (length >= (1 << 16)) + *(*buf)++ = (unsigned char) ((length >> 16) & 0xff); +#endif + if (length >= (1 << 8)) + *(*buf)++ = (unsigned char) ((length >> 8) & 0xff); + *(*buf)++ = (unsigned char) (length & 0xff); + } +} + +/* + * returns decoded length, or < 0 on failure. Advances buf and + * decrements bufsize + */ +static +int der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return -1; + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize) - 1)) + return -1; + if (sf > SIZEOF_INT) + return -1; + ret = 0; + for (; sf; sf--) { + ret = (ret << 8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return ret; +} + +/* + * returns the length of a token, given the mech oid and the body size + */ +int g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return (1 + der_length_size(body_size) + body_size); +} + +/* + * fills in a buffer with the token header. The buffer is assumed to + * be the right size. buf is advanced past the token header + */ +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. + */ +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((toksize -= 1) < 0) + return (G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize -= toid.len) < 0) + return (G_BAD_TOK_HEADER); + toid.data = buf; + buf += toid.len; + + if (!g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more + * important to return G_BAD_TOK_HEADER if the token header is + * in fact bad + */ + if ((toksize -= 2) < 0) + return (G_BAD_TOK_HEADER); + + if (ret) + return (ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return (ret); +} + +/* + * Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. + */ +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t *in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret = 0; + int seqsize; + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return (G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return (G_BAD_TOK_HEADER); + + if ((len -= 1) < 0) + return (G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len -= mech->len) < 0) + return (G_BAD_TOK_HEADER); + OBD_ALLOC_LARGE(mech->data, mech->len); + if (!mech->data) + return (G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h new file mode 100644 index 000000000000..cbfc47cb3f7b --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_internal.h @@ -0,0 +1,526 @@ +/* + * Modified from NFSv4 project for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +#ifndef __PTLRPC_GSS_GSS_INTERNAL_H_ +#define __PTLRPC_GSS_GSS_INTERNAL_H_ + +#include + +/* + * rawobj stuff + */ +typedef struct netobj_s { + __u32 len; + __u8 data[0]; +} netobj_t; + +#define NETOBJ_EMPTY ((netobj_t) { 0 }) + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +#define RAWOBJ_EMPTY ((rawobj_t) { 0, NULL }) + +typedef struct rawobj_buf_s { + __u32 dataoff; + __u32 datalen; + __u32 buflen; + __u8 *buf; +} rawobj_buf_t; + +int rawobj_empty(rawobj_t *obj); +int rawobj_alloc(rawobj_t *obj, char *buf, int len); +void rawobj_free(rawobj_t *obj); +int rawobj_equal(rawobj_t *a, rawobj_t *b); +int rawobj_dup(rawobj_t *dest, rawobj_t *src); +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj); +int rawobj_from_netobj_alloc(rawobj_t *obj, netobj_t *netobj); + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen); + +/* + * several timeout values. client refresh upcall timeout we using + * default in pipefs implemnetation. + */ +#define __TIMEOUT_DELTA (10) + +#define GSS_SECINIT_RPC_TIMEOUT \ + (obd_timeout < __TIMEOUT_DELTA ? \ + __TIMEOUT_DELTA : obd_timeout - __TIMEOUT_DELTA) + +#define GSS_SECFINI_RPC_TIMEOUT (__TIMEOUT_DELTA) +#define GSS_SECSVC_UPCALL_TIMEOUT (GSS_SECINIT_RPC_TIMEOUT) + +/* + * default gc interval + */ +#define GSS_GC_INTERVAL (60 * 60) /* 60 minutes */ + +static inline +unsigned long gss_round_ctx_expiry(unsigned long expiry, + unsigned long sec_flags) +{ + if (sec_flags & PTLRPC_SEC_FL_REVERSE) + return expiry; + + if (get_seconds() + __TIMEOUT_DELTA <= expiry) + return expiry - __TIMEOUT_DELTA; + + return expiry; +} + +/* + * Max encryption element in block cipher algorithms. + */ +#define GSS_MAX_CIPHER_BLOCK (16) + +/* + * XXX make it visible of kernel and lgssd/lsvcgssd + */ +#define GSSD_INTERFACE_VERSION (1) + +#define PTLRPC_GSS_VERSION (1) + + +enum ptlrpc_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpc_gss_tgt { + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, +}; + +enum ptlrpc_gss_header_flags { + LUSTRE_GSS_PACK_BULK = 1, + LUSTRE_GSS_PACK_USER = 2, +}; + +static inline +__u32 import_to_gss_svc(struct obd_import *imp) +{ + const char *name = imp->imp_obd->obd_type->typ_name; + + if (!strcmp(name, LUSTRE_MGC_NAME)) + return LUSTRE_GSS_TGT_MGS; + if (!strcmp(name, LUSTRE_MDC_NAME)) + return LUSTRE_GSS_TGT_MDS; + if (!strcmp(name, LUSTRE_OSC_NAME)) + return LUSTRE_GSS_TGT_OSS; + LBUG(); + return 0; +} + +/* + * following 3 header must have the same size and offset + */ +struct gss_header { + __u8 gh_version; /* gss version */ + __u8 gh_sp; /* sec part */ + __u16 gh_pad0; + __u32 gh_flags; /* wrap flags */ + __u32 gh_proc; /* proc */ + __u32 gh_seq; /* sequence */ + __u32 gh_svc; /* service */ + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; /* context handle */ +}; + +struct gss_rep_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_seqwin; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +struct gss_err_header { + __u8 gh_version; + __u8 gh_sp; + __u16 gh_pad0; + __u32 gh_flags; + __u32 gh_proc; + __u32 gh_major; + __u32 gh_minor; + __u32 gh_pad1; + __u32 gh_pad2; + __u32 gh_pad3; + netobj_t gh_handle; +}; + +/* + * part of wire context information send from client which be saved and + * used later by server. + */ +struct gss_wire_ctx { + __u32 gw_flags; + __u32 gw_proc; + __u32 gw_seq; + __u32 gw_svc; + rawobj_t gw_handle; +}; + +#define PTLRPC_GSS_MAX_HANDLE_SIZE (8) +#define PTLRPC_GSS_HEADER_SIZE (sizeof(struct gss_header) + \ + PTLRPC_GSS_MAX_HANDLE_SIZE) + + +static inline __u64 gss_handle_to_u64(rawobj_t *handle) +{ + if (handle->len != PTLRPC_GSS_MAX_HANDLE_SIZE) + return -1; + return *((__u64 *) handle->data); +} + +#define GSS_SEQ_WIN (2048) +#define GSS_SEQ_WIN_MAIN GSS_SEQ_WIN +#define GSS_SEQ_WIN_BACK (128) +#define GSS_SEQ_REPACK_THRESHOLD (GSS_SEQ_WIN_MAIN / 2 + \ + GSS_SEQ_WIN_MAIN / 4) + +struct gss_svc_seq_data { + spinlock_t ssd_lock; + /* + * highest sequence number seen so far, for main and back window + */ + __u32 ssd_max_main; + __u32 ssd_max_back; + /* + * main and back window + * for i such that ssd_max - GSS_SEQ_WIN < i <= ssd_max, the i-th bit + * of ssd_win is nonzero iff sequence number i has been seen already. + */ + unsigned long ssd_win_main[GSS_SEQ_WIN_MAIN/BITS_PER_LONG]; + unsigned long ssd_win_back[GSS_SEQ_WIN_BACK/BITS_PER_LONG]; +}; + +struct gss_svc_ctx { + struct gss_ctx *gsc_mechctx; + struct gss_svc_seq_data gsc_seqdata; + rawobj_t gsc_rvs_hdl; + __u32 gsc_rvs_seq; + uid_t gsc_uid; + gid_t gsc_gid; + uid_t gsc_mapped_uid; + unsigned int gsc_usr_root:1, + gsc_usr_mds:1, + gsc_usr_oss:1, + gsc_remote:1, + gsc_reverse:1; +}; + +struct gss_svc_reqctx { + struct ptlrpc_svc_ctx src_base; + /* + * context + */ + struct gss_wire_ctx src_wirectx; + struct gss_svc_ctx *src_ctx; + /* + * record place of bulk_sec_desc in request/reply buffer + */ + struct ptlrpc_bulk_sec_desc *src_reqbsd; + int src_reqbsd_size; + struct ptlrpc_bulk_sec_desc *src_repbsd; + int src_repbsd_size; + /* + * flags + */ + unsigned int src_init:1, + src_init_continue:1, + src_err_notify:1; + int src_reserve_len; +}; + +struct gss_cli_ctx { + struct ptlrpc_cli_ctx gc_base; + __u32 gc_flavor; + __u32 gc_proc; + __u32 gc_win; + atomic_t gc_seq; + rawobj_t gc_handle; + struct gss_ctx *gc_mechctx; + /* handle for the buddy svc ctx */ + rawobj_t gc_svc_handle; +}; + +struct gss_cli_ctx_keyring { + struct gss_cli_ctx gck_base; + struct key *gck_key; + struct timer_list *gck_timer; +}; + +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; + spinlock_t gs_lock; + __u64 gs_rvs_hdl; +}; + +struct gss_sec_pipefs { + struct gss_sec gsp_base; + int gsp_chash_size; /* must be 2^n */ + struct hlist_head gsp_chash[0]; +}; + +/* + * FIXME cleanup the keyring upcall mutexes + */ +#define HAVE_KEYRING_UPCALL_SERIALIZED 1 + +struct gss_sec_keyring { + struct gss_sec gsk_base; + /* + * all contexts listed here. access is protected by sec spinlock. + */ + struct hlist_head gsk_clist; + /* + * specially point to root ctx (only one at a time). access is + * protected by sec spinlock. + */ + struct ptlrpc_cli_ctx *gsk_root_ctx; + /* + * specially serialize upcalls for root context. + */ + struct mutex gsk_root_uc_lock; + +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + struct mutex gsk_uc_lock; /* serialize upcalls */ +#endif +}; + +static inline struct gss_cli_ctx *ctx2gctx(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx, struct gss_cli_ctx, gc_base); +} + +static inline +struct gss_cli_ctx_keyring *ctx2gctx_keyring(struct ptlrpc_cli_ctx *ctx) +{ + return container_of(ctx2gctx(ctx), + struct gss_cli_ctx_keyring, gck_base); +} + +static inline struct gss_sec *sec2gsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct gss_sec, gs_base); +} + +static inline struct gss_sec_pipefs *sec2gsec_pipefs(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_pipefs, gsp_base); +} + +static inline struct gss_sec_keyring *sec2gsec_keyring(struct ptlrpc_sec *sec) +{ + return container_of(sec2gsec(sec), struct gss_sec_keyring, gsk_base); +} + + +#define GSS_CTX_INIT_MAX_LEN (1024) + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. + */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +static inline +struct gss_svc_reqctx *gss_svc_ctx2reqctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return container_of(ctx, struct gss_svc_reqctx, src_base); +} + +static inline +struct gss_svc_ctx *gss_svc_ctx2gssctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(ctx); + return gss_svc_ctx2reqctx(ctx)->src_ctx; +} + +/* sec_gss.c */ +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); +int gss_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); + +int gss_sec_install_rctx(struct obd_import *imp, struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize); +void gss_free_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int segment, int newsize); + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, + struct ptlrpc_request *req); +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx); +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int gss_svc_authorize(struct ptlrpc_request *req); +void gss_svc_free_rs(struct ptlrpc_reply_state *rs); +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx); + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx); + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx); + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed); +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment); + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx); +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor); +int gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num, int set); + +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf); +void gss_sec_destroy_common(struct gss_sec *gsec); +void gss_sec_kill(struct ptlrpc_sec *sec); + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred); +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize); + +/* gss_keyring.c */ +int __init gss_init_keyring(void); +void __exit gss_exit_keyring(void); + +/* gss_pipefs.c */ +int __init gss_init_pipefs(void); +void __exit gss_exit_pipefs(void); + +/* gss_bulk.c */ +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int gss_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* gss_mech_switch.c */ +int init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + + +/* gss_cli_upcall.c */ +int gss_do_ctx_init_rpc(char *buffer, unsigned long count); +int gss_do_ctx_fini_rpc(struct gss_cli_ctx *gctx); + +int __init gss_init_cli_upcall(void); +void __exit gss_exit_cli_upcall(void); + +/* gss_svc_upcall.c */ +__u64 gss_get_next_ctx_index(void); +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx); +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle); +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx); +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq); +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token); +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw); +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx); +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx); + +int __init gss_init_svc_upcall(void); +void __exit gss_exit_svc_upcall(void); + +/* lproc_gss.c */ +void gss_stat_oos_record_cli(int behind); +void gss_stat_oos_record_svc(int phase, int replay); + +int __init gss_init_lproc(void); +void __exit gss_exit_lproc(void); + +/* gss_krb5_mech.c */ +int __init init_kerberos_module(void); +void __exit cleanup_kerberos_module(void); + + +/* debug */ +static inline +void __dbg_memdump(char *name, void *ptr, int size) +{ + char *buf, *p = (char *) ptr; + int bufsize = size * 2 + 1, i; + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CDEBUG(D_ERROR, "DUMP ERROR: can't alloc %d bytes\n", bufsize); + return; + } + + for (i = 0; i < size; i++) + sprintf(&buf[i+i], "%02x", (__u8) p[i]); + buf[size + size] = '\0'; + LCONSOLE_INFO("DUMP %s@%p(%d): %s\n", name, ptr, size, buf); + OBD_FREE(buf, bufsize); +} + +#endif /* __PTLRPC_GSS_GSS_INTERNAL_H_ */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c new file mode 100644 index 000000000000..bb571ae51054 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_keyring.c @@ -0,0 +1,1424 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_keyring.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_keyring; +static struct ptlrpc_ctx_ops gss_keyring_ctxops; +static struct key_type gss_key_type; + +static int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx); + +/* + * the timeout is only for the case that upcall child process die abnormally. + * in any other cases it should finally update kernel key. + * + * FIXME we'd better to incorporate the client & server side upcall timeouts + * into the framework of Adaptive Timeouts, but we need to figure out how to + * make sure that kernel knows the upcall processes is in-progress or died + * unexpectedly. + */ +#define KEYRING_UPCALL_TIMEOUT (obd_timeout + obd_timeout) + +/**************************************** + * internal helpers * + ****************************************/ + +#define DUMP_PROCESS_KEYRINGS(tsk) \ +{ \ + CWARN("DUMP PK: %s[%u,%u/%u](<-%s[%u,%u/%u]): " \ + "a %d, t %d, p %d, s %d, u %d, us %d, df %d\n", \ + tsk->comm, tsk->pid, tsk->uid, tsk->fsuid, \ + tsk->parent->comm, tsk->parent->pid, \ + tsk->parent->uid, tsk->parent->fsuid, \ + tsk->request_key_auth ? \ + tsk->request_key_auth->serial : 0, \ + key_cred(tsk)->thread_keyring ? \ + key_cred(tsk)->thread_keyring->serial : 0, \ + key_tgcred(tsk)->process_keyring ? \ + key_tgcred(tsk)->process_keyring->serial : 0, \ + key_tgcred(tsk)->session_keyring ? \ + key_tgcred(tsk)->session_keyring->serial : 0, \ + key_cred(tsk)->user->uid_keyring ? \ + key_cred(tsk)->user->uid_keyring->serial : 0, \ + key_cred(tsk)->user->session_keyring ? \ + key_cred(tsk)->user->session_keyring->serial : 0, \ + key_cred(tsk)->jit_keyring \ + ); \ +} + +#define DUMP_KEY(key) \ +{ \ + CWARN("DUMP KEY: %p(%d) ref %d u%u/g%u desc %s\n", \ + key, key->serial, atomic_read(&key->usage), \ + key->uid, key->gid, \ + key->description ? key->description : "n/a" \ + ); \ +} + +#define key_cred(tsk) ((tsk)->cred) +#define key_tgcred(tsk) ((tsk)->cred->tgcred) + +static inline void keyring_upcall_lock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_lock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void keyring_upcall_unlock(struct gss_sec_keyring *gsec_kr) +{ +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_unlock(&gsec_kr->gsk_uc_lock); +#endif +} + +static inline void key_revoke_locked(struct key *key) +{ + set_bit(KEY_FLAG_REVOKED, &key->flags); +} + +static void ctx_upcall_timeout_kr(unsigned long data) +{ + struct ptlrpc_cli_ctx *ctx = (struct ptlrpc_cli_ctx *) data; + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + CWARN("ctx %p, key %p\n", ctx, key); + + LASSERT(key); + + cli_ctx_expire(ctx); + key_revoke_locked(key); +} + +static +void ctx_start_timer_kr(struct ptlrpc_cli_ctx *ctx, long timeout) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = gctx_kr->gck_timer; + + LASSERT(timer); + + CDEBUG(D_SEC, "ctx %p: start timer %lds\n", ctx, timeout); + timeout = timeout * HZ + cfs_time_current(); + + init_timer(timer); + timer->expires = timeout; + timer->data = (unsigned long ) ctx; + timer->function = ctx_upcall_timeout_kr; + + add_timer(timer); +} + +/* + * caller should make sure no race with other threads + */ +static +void ctx_clear_timer_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + struct timer_list *timer = gctx_kr->gck_timer; + + if (timer == NULL) + return; + + CDEBUG(D_SEC, "ctx %p, key %p\n", ctx, gctx_kr->gck_key); + + gctx_kr->gck_timer = NULL; + + del_singleshot_timer_sync(timer); + + OBD_FREE_PTR(timer); +} + +static +struct ptlrpc_cli_ctx *ctx_create_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx_keyring *gctx_kr; + + OBD_ALLOC_PTR(gctx_kr); + if (gctx_kr == NULL) + return NULL; + + OBD_ALLOC_PTR(gctx_kr->gck_timer); + if (gctx_kr->gck_timer == NULL) { + OBD_FREE_PTR(gctx_kr); + return NULL; + } + init_timer(gctx_kr->gck_timer); + + ctx = &gctx_kr->gck_base.gc_base; + + if (gss_cli_ctx_init_common(sec, ctx, &gss_keyring_ctxops, vcred)) { + OBD_FREE_PTR(gctx_kr->gck_timer); + OBD_FREE_PTR(gctx_kr); + return NULL; + } + + ctx->cc_expire = cfs_time_current_sec() + KEYRING_UPCALL_TIMEOUT; + clear_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); /* for the caller */ + + return ctx; +} + +static void ctx_destroy_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_cli_ctx_keyring *gctx_kr = ctx2gctx_keyring(ctx); + + CDEBUG(D_SEC, "destroying ctx %p\n", ctx); + + /* at this time the association with key has been broken. */ + LASSERT(sec); + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(gctx_kr->gck_key == NULL); + + ctx_clear_timer_kr(ctx); + LASSERT(gctx_kr->gck_timer == NULL); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx_kr); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static void ctx_release_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + if (sync) { + ctx_destroy_kr(ctx); + } else { + atomic_inc(&ctx->cc_refcount); + sptlrpc_gc_add_ctx(ctx); + } +} + +static void ctx_put_kr(struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (atomic_dec_and_test(&ctx->cc_refcount)) + ctx_release_kr(ctx, sync); +} + +/* + * key <-> ctx association and rules: + * - ctx might not bind with any key + * - key/ctx binding is protected by key semaphore (if the key present) + * - key and ctx each take a reference of the other + * - ctx enlist/unlist is protected by ctx spinlock + * - never enlist a ctx after it's been unlisted + * - whoever do enlist should also do bind, lock key before enlist: + * - lock key -> lock ctx -> enlist -> unlock ctx -> bind -> unlock key + * - whoever do unlist should also do unbind: + * - lock key -> lock ctx -> unlist -> unlock ctx -> unbind -> unlock key + * - lock ctx -> unlist -> unlock ctx -> lock key -> unbind -> unlock key + */ + +static inline void spin_lock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_lock(lock); +} + +static inline void spin_unlock_if(spinlock_t *lock, int condition) +{ + if (condition) + spin_unlock(lock); +} + +static void ctx_enlist_kr(struct ptlrpc_cli_ctx *ctx, int is_root, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + LASSERT(!test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + spin_lock_if(&sec->ps_lock, !locked); + + atomic_inc(&ctx->cc_refcount); + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + hlist_add_head(&ctx->cc_cache, &gsec_kr->gsk_clist); + if (is_root) + gsec_kr->gsk_root_ctx = ctx; + + spin_unlock_if(&sec->ps_lock, !locked); +} + +/* + * Note after this get called, caller should not access ctx again because + * it might have been freed, unless caller hold at least one refcount of + * the ctx. + * + * return non-zero if we indeed unlist this ctx. + */ +static int ctx_unlist_kr(struct ptlrpc_cli_ctx *ctx, int locked) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + /* if hashed bit has gone, leave the job to somebody who is doing it */ + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0) + return 0; + + /* drop ref inside spin lock to prevent race with other operations */ + spin_lock_if(&sec->ps_lock, !locked); + + if (gsec_kr->gsk_root_ctx == ctx) + gsec_kr->gsk_root_ctx = NULL; + hlist_del_init(&ctx->cc_cache); + atomic_dec(&ctx->cc_refcount); + + spin_unlock_if(&sec->ps_lock, !locked); + + return 1; +} + +/* + * bind a key with a ctx together. + * caller must hold write lock of the key, as well as ref on key & ctx. + */ +static void bind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(atomic_read(&key->usage) > 0); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == NULL); + LASSERT(key->payload.data == NULL); + + /* at this time context may or may not in list. */ + key_get(key); + atomic_inc(&ctx->cc_refcount); + ctx2gctx_keyring(ctx)->gck_key = key; + key->payload.data = ctx; +} + +/* + * unbind a key and a ctx. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_ctx(struct key *key, struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(key->payload.data == ctx); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + + /* must revoke the key, or others may treat it as newly created */ + key_revoke_locked(key); + + key->payload.data = NULL; + ctx2gctx_keyring(ctx)->gck_key = NULL; + + /* once ctx get split from key, the timer is meaningless */ + ctx_clear_timer_kr(ctx); + + ctx_put_kr(ctx, 1); + key_put(key); +} + +/* + * given a ctx, unbind with its coupled key, if any. + * unbind could only be called once, so we don't worry the key be released + * by someone else. + */ +static void unbind_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + struct key *key = ctx2gctx_keyring(ctx)->gck_key; + + if (key) { + LASSERT(key->payload.data == ctx); + + key_get(key); + down_write(&key->sem); + unbind_key_ctx(key, ctx); + up_write(&key->sem); + key_put(key); + } +} + +/* + * given a key, unbind with its coupled ctx, if any. + * caller must hold write lock, as well as a ref of the key. + */ +static void unbind_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key->payload.data; + + if (ctx) + unbind_key_ctx(key, ctx); +} + +/* + * unlist a ctx, and unbind from coupled key + */ +static void kill_ctx_kr(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_unlist_kr(ctx, 0)) + unbind_ctx_kr(ctx); +} + +/* + * given a key, unlist and unbind with the coupled ctx (if any). + * caller must hold write lock, as well as a ref of the key. + */ +static void kill_key_locked(struct key *key) +{ + struct ptlrpc_cli_ctx *ctx = key->payload.data; + + if (ctx && ctx_unlist_kr(ctx, 0)) + unbind_key_locked(key); +} + +/* + * caller should hold one ref on contexts in freelist. + */ +static void dispose_ctx_list_kr(struct hlist_head *freelist) +{ + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + + hlist_for_each_entry_safe(ctx, next, freelist, cc_cache) { + hlist_del_init(&ctx->cc_cache); + + /* reverse ctx: update current seq to buddy svcctx if exist. + * ideally this should be done at gss_cli_ctx_finalize(), but + * the ctx destroy could be delayed by: + * 1) ctx still has reference; + * 2) ctx destroy is asynchronous; + * and reverse import call inval_all_ctx() require this be done + *_immediately_ otherwise newly created reverse ctx might copy + * the very old sequence number from svcctx. */ + gctx = ctx2gctx(ctx); + if (!rawobj_empty(&gctx->gc_svc_handle) && + sec_is_reverse(gctx->gc_base.cc_sec)) { + gss_svc_upcall_update_sequence(&gctx->gc_svc_handle, + (__u32) atomic_read(&gctx->gc_seq)); + } + + /* we need to wakeup waiting reqs here. the context might + * be forced released before upcall finished, then the + * late-arrived downcall can't find the ctx even. */ + sptlrpc_cli_ctx_wakeup(ctx); + + unbind_ctx_kr(ctx); + ctx_put_kr(ctx, 0); + } +} + +/* + * lookup a root context directly in a sec, return root ctx with a + * reference taken or NULL. + */ +static +struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + + spin_lock(&sec->ps_lock); + + ctx = gsec_kr->gsk_root_ctx; + + if (ctx == NULL && unlikely(sec_is_reverse(sec))) { + struct ptlrpc_cli_ctx *tmp; + + /* reverse ctx, search root ctx in list, choose the one + * with shortest expire time, which is most possibly have + * an established peer ctx at client side. */ + hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) { + if (ctx == NULL || ctx->cc_expire == 0 || + ctx->cc_expire > tmp->cc_expire) { + ctx = tmp; + /* promote to be root_ctx */ + gsec_kr->gsk_root_ctx = ctx; + } + } + } + + if (ctx) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(!hlist_empty(&gsec_kr->gsk_clist)); + atomic_inc(&ctx->cc_refcount); + } + + spin_unlock(&sec->ps_lock); + + return ctx; +} + +#define RVS_CTX_EXPIRE_NICE (10) + +static +void rvs_sec_install_root_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *new_ctx, + struct key *key) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx; + cfs_time_t now; + ENTRY; + + LASSERT(sec_is_reverse(sec)); + + spin_lock(&sec->ps_lock); + + now = cfs_time_current_sec(); + + /* set all existing ctxs short expiry */ + hlist_for_each_entry(ctx, &gsec_kr->gsk_clist, cc_cache) { + if (ctx->cc_expire > now + RVS_CTX_EXPIRE_NICE) { + ctx->cc_early_expire = 1; + ctx->cc_expire = now + RVS_CTX_EXPIRE_NICE; + } + } + + /* if there's root_ctx there, instead obsolete the current + * immediately, we leave it continue operating for a little while. + * hopefully when the first backward rpc with newest ctx send out, + * the client side already have the peer ctx well established. */ + ctx_enlist_kr(new_ctx, gsec_kr->gsk_root_ctx ? 0 : 1, 1); + + if (key) + bind_key_ctx(key, new_ctx); + + spin_unlock(&sec->ps_lock); +} + +static void construct_key_desc(void *buf, int bufsize, + struct ptlrpc_sec *sec, uid_t uid) +{ + snprintf(buf, bufsize, "%d@%x", uid, sec->ps_id); + ((char *)buf)[bufsize - 1] = '\0'; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_sec * gss_sec_create_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_keyring *gsec_kr; + ENTRY; + + OBD_ALLOC(gsec_kr, sizeof(*gsec_kr)); + if (gsec_kr == NULL) + RETURN(NULL); + + INIT_HLIST_HEAD(&gsec_kr->gsk_clist); + gsec_kr->gsk_root_ctx = NULL; + mutex_init(&gsec_kr->gsk_root_uc_lock); +#ifdef HAVE_KEYRING_UPCALL_SERIALIZED + mutex_init(&gsec_kr->gsk_uc_lock); +#endif + + if (gss_sec_create_common(&gsec_kr->gsk_base, &gss_policy_keyring, + imp, svcctx, sf)) + goto err_free; + + if (svcctx != NULL && + sec_install_rctx_kr(&gsec_kr->gsk_base.gs_base, svcctx)) { + gss_sec_destroy_common(&gsec_kr->gsk_base); + goto err_free; + } + + RETURN(&gsec_kr->gsk_base.gs_base); + +err_free: + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); + RETURN(NULL); +} + +static +void gss_sec_destroy_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec = sec2gsec(sec); + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + + CDEBUG(D_SEC, "destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + LASSERT(hlist_empty(&gsec_kr->gsk_clist)); + LASSERT(gsec_kr->gsk_root_ctx == NULL); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec_kr, sizeof(*gsec_kr)); +} + +static inline int user_is_root(struct ptlrpc_sec *sec, struct vfs_cred *vcred) +{ + /* except the ROOTONLY flag, treat it as root user only if real uid + * is 0, euid/fsuid being 0 are handled as setuid scenarios */ + if (sec_is_rootonly(sec) || (vcred->vc_uid == 0)) + return 1; + else + return 0; +} + +/* + * unlink request key from it's ring, which is linked during request_key(). + * sadly, we have to 'guess' which keyring it's linked to. + * + * FIXME this code is fragile, depend on how request_key_link() is implemented. + */ +static void request_key_unlink(struct key *key) +{ + struct task_struct *tsk = current; + struct key *ring; + + switch (key_cred(tsk)->jit_keyring) { + case KEY_REQKEY_DEFL_DEFAULT: + case KEY_REQKEY_DEFL_THREAD_KEYRING: + ring = key_get(key_cred(tsk)->thread_keyring); + if (ring) + break; + case KEY_REQKEY_DEFL_PROCESS_KEYRING: + ring = key_get(key_tgcred(tsk)->process_keyring); + if (ring) + break; + case KEY_REQKEY_DEFL_SESSION_KEYRING: + rcu_read_lock(); + ring = key_get(rcu_dereference(key_tgcred(tsk) + ->session_keyring)); + rcu_read_unlock(); + if (ring) + break; + case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: + ring = key_get(key_cred(tsk)->user->session_keyring); + break; + case KEY_REQKEY_DEFL_USER_KEYRING: + ring = key_get(key_cred(tsk)->user->uid_keyring); + break; + case KEY_REQKEY_DEFL_GROUP_KEYRING: + default: + LBUG(); + } + + LASSERT(ring); + key_unlink(ring, key); + key_put(ring); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_kr(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct obd_import *imp = sec->ps_import; + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct ptlrpc_cli_ctx *ctx = NULL; + unsigned int is_root = 0, create_new = 0; + struct key *key; + char desc[24]; + char *coinfo; + int coinfo_size; + char *co_flags = ""; + ENTRY; + + LASSERT(imp != NULL); + + is_root = user_is_root(sec, vcred); + + /* a little bit optimization for root context */ + if (is_root) { + ctx = sec_lookup_root_ctx_kr(sec); + /* + * Only lookup directly for REVERSE sec, which should + * always succeed. + */ + if (ctx || sec_is_reverse(sec)) + RETURN(ctx); + } + + LASSERT(create != 0); + + /* for root context, obtain lock and check again, this time hold + * the root upcall lock, make sure nobody else populated new root + * context after last check. */ + if (is_root) { + mutex_lock(&gsec_kr->gsk_root_uc_lock); + + ctx = sec_lookup_root_ctx_kr(sec); + if (ctx) + goto out; + + /* update reverse handle for root user */ + sec2gsec(sec)->gs_rvs_hdl = gss_get_next_ctx_index(); + + switch (sec->ps_part) { + case LUSTRE_SP_MDT: + co_flags = "m"; + break; + case LUSTRE_SP_OST: + co_flags = "o"; + break; + case LUSTRE_SP_MGC: + co_flags = "rmo"; + break; + case LUSTRE_SP_CLI: + co_flags = "r"; + break; + case LUSTRE_SP_MGS: + default: + LBUG(); + } + } + + /* in case of setuid, key will be constructed as owner of fsuid/fsgid, + * but we do authentication based on real uid/gid. the key permission + * bits will be exactly as POS_ALL, so only processes who subscribed + * this key could have the access, although the quota might be counted + * on others (fsuid/fsgid). + * + * keyring will use fsuid/fsgid as upcall parameters, so we have to + * encode real uid/gid into callout info. + */ + + construct_key_desc(desc, sizeof(desc), sec, vcred->vc_uid); + + /* callout info format: + * secid:mech:uid:gid:flags:svc_type:peer_nid:target_uuid + */ + coinfo_size = sizeof(struct obd_uuid) + MAX_OBD_NAME + 64; + OBD_ALLOC(coinfo, coinfo_size); + if (coinfo == NULL) + goto out; + + snprintf(coinfo, coinfo_size, "%d:%s:%u:%u:%s:%d:"LPX64":%s", + sec->ps_id, sec2gsec(sec)->gs_mech->gm_name, + vcred->vc_uid, vcred->vc_gid, + co_flags, import_to_gss_svc(imp), + imp->imp_connection->c_peer.nid, imp->imp_obd->obd_name); + + CDEBUG(D_SEC, "requesting key for %s\n", desc); + + keyring_upcall_lock(gsec_kr); + key = request_key(&gss_key_type, desc, coinfo); + keyring_upcall_unlock(gsec_kr); + + OBD_FREE(coinfo, coinfo_size); + + if (IS_ERR(key)) { + CERROR("failed request key: %ld\n", PTR_ERR(key)); + goto out; + } + CDEBUG(D_SEC, "obtained key %08x for %s\n", key->serial, desc); + + /* once payload.data was pointed to a ctx, it never changes until + * we de-associate them; but parallel request_key() may return + * a key with payload.data == NULL at the same time. so we still + * need wirtelock of key->sem to serialize them. */ + down_write(&key->sem); + + if (likely(key->payload.data != NULL)) { + ctx = key->payload.data; + + LASSERT(atomic_read(&ctx->cc_refcount) >= 1); + LASSERT(ctx2gctx_keyring(ctx)->gck_key == key); + LASSERT(atomic_read(&key->usage) >= 2); + + /* simply take a ref and return. it's upper layer's + * responsibility to detect & replace dead ctx. */ + atomic_inc(&ctx->cc_refcount); + } else { + /* pre initialization with a cli_ctx. this can't be done in + * key_instantiate() because we'v no enough information + * there. */ + ctx = ctx_create_kr(sec, vcred); + if (ctx != NULL) { + ctx_enlist_kr(ctx, is_root, 0); + bind_key_ctx(key, ctx); + + ctx_start_timer_kr(ctx, KEYRING_UPCALL_TIMEOUT); + + CDEBUG(D_SEC, "installed key %p <-> ctx %p (sec %p)\n", + key, ctx, sec); + } else { + /* we'd prefer to call key_revoke(), but we more like + * to revoke it within this key->sem locked period. */ + key_revoke_locked(key); + } + + create_new = 1; + } + + up_write(&key->sem); + + if (is_root && create_new) + request_key_unlink(key); + + key_put(key); +out: + if (is_root) + mutex_unlock(&gsec_kr->gsk_root_uc_lock); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + ctx_release_kr(ctx, sync); +} + +/* + * flush context of normal user, we must resort to keyring itself to find out + * contexts which belong to me. + * + * Note here we suppose only to flush _my_ context, the "uid" will + * be ignored in the search. + */ +static +void flush_user_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct key *key; + char desc[24]; + + /* nothing to do for reverse or rootonly sec */ + if (sec_is_reverse(sec) || sec_is_rootonly(sec)) + return; + + construct_key_desc(desc, sizeof(desc), sec, uid); + + /* there should be only one valid key, but we put it in the + * loop in case of any weird cases */ + for (;;) { + key = request_key(&gss_key_type, desc, NULL); + if (IS_ERR(key)) { + CDEBUG(D_SEC, "No more key found for current user\n"); + break; + } + + down_write(&key->sem); + + kill_key_locked(key); + + /* kill_key_locked() should usually revoke the key, but we + * revoke it again to make sure, e.g. some case the key may + * not well coupled with a context. */ + key_revoke_locked(key); + + up_write(&key->sem); + + key_put(key); + } +} + +/* + * flush context of root or all, we iterate through the list. + */ +static +void flush_spec_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec_keyring *gsec_kr; + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + gsec_kr = sec2gsec_keyring(sec); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + /* at this moment there's at least 2 base reference: + * key association and in-list. */ + if (atomic_read(&ctx->cc_refcount) > 2) { + if (!force) + continue; + CWARN("flush busy ctx %p(%u->%s, extra ref %d)\n", + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), + atomic_read(&ctx->cc_refcount) - 2); + } + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + atomic_inc(&ctx->cc_refcount); + + if (ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; +} + +static +int gss_sec_flush_ctx_cache_kr(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + ENTRY; + + CDEBUG(D_SEC, "sec %p(%d, nctx %d), uid %d, grace %d, force %d\n", + sec, atomic_read(&sec->ps_refcount), + atomic_read(&sec->ps_nctx), + uid, grace, force); + + if (uid != -1 && uid != 0) + flush_user_ctx_cache_kr(sec, uid, grace, force); + else + flush_spec_ctx_cache_kr(sec, uid, grace, force); + + RETURN(0); +} + +static +void gss_sec_gc_ctx_kr(struct ptlrpc_sec *sec) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_head freelist = HLIST_HEAD_INIT; + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + CWARN("running gc\n"); + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, + &gsec_kr->gsk_clist, cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + atomic_inc(&ctx->cc_refcount); + + if (cli_ctx_check_death(ctx) && ctx_unlist_kr(ctx, 1)) { + hlist_add_head(&ctx->cc_cache, &freelist); + CWARN("unhashed ctx %p\n", ctx); + } else { + LASSERT(atomic_read(&ctx->cc_refcount) >= 2); + atomic_dec(&ctx->cc_refcount); + } + } + spin_unlock(&sec->ps_lock); + + dispose_ctx_list_kr(&freelist); + EXIT; + return; +} + +static +int gss_sec_display_kr(struct ptlrpc_sec *sec, struct seq_file *seq) +{ + struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec); + struct hlist_node *next; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx; + time_t now = cfs_time_current_sec(); + ENTRY; + + spin_lock(&sec->ps_lock); + hlist_for_each_entry_safe(ctx, next, + &gsec_kr->gsk_clist, cc_cache) { + struct key *key; + char flags_str[40]; + char mech[40]; + + gctx = ctx2gctx(ctx); + key = ctx2gctx_keyring(ctx)->gck_key; + + gss_cli_ctx_flags2str(ctx->cc_flags, + flags_str, sizeof(flags_str)); + + if (gctx->gc_mechctx) + lgss_display(gctx->gc_mechctx, mech, sizeof(mech)); + else + snprintf(mech, sizeof(mech), "N/A"); + mech[sizeof(mech) - 1] = '\0'; + + seq_printf(seq, "%p: uid %u, ref %d, expire %ld(%+ld), fl %s, " + "seq %d, win %u, key %08x(ref %d), " + "hdl "LPX64":"LPX64", mech: %s\n", + ctx, ctx->cc_vcred.vc_uid, + atomic_read(&ctx->cc_refcount), + ctx->cc_expire, + ctx->cc_expire ? ctx->cc_expire - now : 0, + flags_str, + atomic_read(&gctx->gc_seq), + gctx->gc_win, + key ? key->serial : 0, + key ? atomic_read(&key->usage) : 0, + gss_handle_to_u64(&gctx->gc_handle), + gss_handle_to_u64(&gctx->gc_svc_handle), + mech); + } + spin_unlock(&sec->ps_lock); + + RETURN(0); +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int gss_cli_ctx_refresh_kr(struct ptlrpc_cli_ctx *ctx) +{ + /* upcall is already on the way */ + return 0; +} + +static +int gss_cli_ctx_validate_kr(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + if (cli_ctx_check_death(ctx)) { + kill_ctx_kr(ctx); + return 1; + } + + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_kr(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + cli_ctx_expire(ctx); + kill_ctx_kr(ctx); +} + +/**************************************** + * (reverse) service * + ****************************************/ + +/* + * reverse context could have nothing to do with keyrings. here we still keep + * the version which bind to a key, for future reference. + */ +#define HAVE_REVERSE_CTX_NOKEY + + +static +int sec_install_rctx_kr(struct ptlrpc_sec *sec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_cli_ctx *cli_ctx; + struct vfs_cred vcred = { 0, 0 }; + int rc; + + LASSERT(sec); + LASSERT(svc_ctx); + + cli_ctx = ctx_create_kr(sec, &vcred); + if (cli_ctx == NULL) + return -ENOMEM; + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + CERROR("failed copy reverse cli ctx: %d\n", rc); + + ctx_put_kr(cli_ctx, 1); + return rc; + } + + rvs_sec_install_root_ctx_kr(sec, cli_ctx, NULL); + + ctx_put_kr(cli_ctx, 1); + + return 0; +} + + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_kr(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_keyring, req); +} + +static +int gss_svc_install_rctx_kr(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + + rc = sec_install_rctx_kr(sec, svc_ctx); + sptlrpc_sec_put(sec); + + return rc; +} + +/**************************************** + * key apis * + ****************************************/ + +static +int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) +{ + int rc; + ENTRY; + + if (data != NULL || datalen != 0) { + CERROR("invalid: data %p, len %lu\n", data, (long)datalen); + RETURN(-EINVAL); + } + + if (key->payload.data != 0) { + CERROR("key already have payload\n"); + RETURN(-EINVAL); + } + + /* link the key to session keyring, so following context negotiation + * rpc fired from user space could find this key. This will be unlinked + * automatically when upcall processes die. + * + * we can't do this through keyctl from userspace, because the upcall + * might be neither possessor nor owner of the key (setuid). + * + * the session keyring is created upon upcall, and don't change all + * the way until upcall finished, so rcu lock is not needed here. + */ + LASSERT(key_tgcred(current)->session_keyring); + + lockdep_off(); + rc = key_link(key_tgcred(current)->session_keyring, key); + lockdep_on(); + if (unlikely(rc)) { + CERROR("failed to link key %08x to keyring %08x: %d\n", + key->serial, + key_tgcred(current)->session_keyring->serial, rc); + RETURN(rc); + } + + CDEBUG(D_SEC, "key %p instantiated, ctx %p\n", key, key->payload.data); + RETURN(0); +} + +/* + * called with key semaphore write locked. it means we can operate + * on the context without fear of loosing refcount. + */ +static +int gss_kt_update(struct key *key, const void *data, size_t datalen) +{ + struct ptlrpc_cli_ctx *ctx = key->payload.data; + struct gss_cli_ctx *gctx; + rawobj_t tmpobj = RAWOBJ_EMPTY; + __u32 datalen32 = (__u32) datalen; + int rc; + ENTRY; + + if (data == NULL || datalen == 0) { + CWARN("invalid: data %p, len %lu\n", data, (long)datalen); + RETURN(-EINVAL); + } + + /* if upcall finished negotiation too fast (mostly likely because + * of local error happened) and call kt_update(), the ctx + * might be still NULL. but the key will finally be associate + * with a context, or be revoked. if key status is fine, return + * -EAGAIN to allow userspace sleep a while and call again. */ + if (ctx == NULL) { + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", + key, key->serial, key->flags); + + rc = key_validate(key); + if (rc == 0) + RETURN(-EAGAIN); + else + RETURN(rc); + } + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(ctx->cc_sec); + + ctx_clear_timer_kr(ctx); + + /* don't proceed if already refreshed */ + if (cli_ctx_is_refreshed(ctx)) { + CWARN("ctx already done refresh\n"); + RETURN(0); + } + + sptlrpc_cli_ctx_get(ctx); + gctx = ctx2gctx(ctx); + + rc = buffer_extract_bytes(&data, &datalen32, &gctx->gc_win, + sizeof(gctx->gc_win)); + if (rc) { + CERROR("failed extract seq_win\n"); + goto out; + } + + if (gctx->gc_win == 0) { + __u32 nego_rpc_err, nego_gss_err; + + rc = buffer_extract_bytes(&data, &datalen32, &nego_rpc_err, + sizeof(nego_rpc_err)); + if (rc) { + CERROR("failed to extrace rpc rc\n"); + goto out; + } + + rc = buffer_extract_bytes(&data, &datalen32, &nego_gss_err, + sizeof(nego_gss_err)); + if (rc) { + CERROR("failed to extrace gss rc\n"); + goto out; + } + + CERROR("negotiation: rpc err %d, gss err %x\n", + nego_rpc_err, nego_gss_err); + + rc = nego_rpc_err ? nego_rpc_err : -EACCES; + } else { + rc = rawobj_extract_local_alloc(&gctx->gc_handle, + (__u32 **) &data, &datalen32); + if (rc) { + CERROR("failed extract handle\n"); + goto out; + } + + rc = rawobj_extract_local(&tmpobj, (__u32 **) &data,&datalen32); + if (rc) { + CERROR("failed extract mech\n"); + goto out; + } + + rc = lgss_import_sec_context(&tmpobj, + sec2gsec(ctx->cc_sec)->gs_mech, + &gctx->gc_mechctx); + if (rc != GSS_S_COMPLETE) + CERROR("failed import context\n"); + else + rc = 0; + } +out: + /* we don't care what current status of this ctx, even someone else + * is operating on the ctx at the same time. we just add up our own + * opinions here. */ + if (rc == 0) { + gss_cli_ctx_uptodate(gctx); + } else { + /* this will also revoke the key. has to be done before + * wakeup waiters otherwise they can find the stale key */ + kill_key_locked(key); + + cli_ctx_expire(ctx); + + if (rc != -ERESTART) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } + + /* let user space think it's a success */ + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int gss_kt_match(const struct key *key, const void *desc) +{ + return (strcmp(key->description, (const char *) desc) == 0); +} + +static +void gss_kt_destroy(struct key *key) +{ + ENTRY; + LASSERT(key->payload.data == NULL); + CDEBUG(D_SEC, "destroy key %p\n", key); + EXIT; +} + +static +void gss_kt_describe(const struct key *key, struct seq_file *s) +{ + if (key->description == NULL) + seq_puts(s, "[null]"); + else + seq_puts(s, key->description); +} + +static struct key_type gss_key_type = +{ + .name = "lgssc", + .def_datalen = 0, + .instantiate = gss_kt_instantiate, + .update = gss_kt_update, + .match = gss_kt_match, + .destroy = gss_kt_destroy, + .describe = gss_kt_describe, +}; + +/**************************************** + * lustre gss keyring policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_keyring_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_kr, + .validate = gss_cli_ctx_validate_kr, + .die = gss_cli_ctx_die_kr, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_keyring_cops = { + .create_sec = gss_sec_create_kr, + .destroy_sec = gss_sec_destroy_kr, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_kr, + .release_ctx = gss_sec_release_ctx_kr, + .flush_ctx_cache = gss_sec_flush_ctx_cache_kr, + .gc_ctx = gss_sec_gc_ctx_kr, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, + .display = gss_sec_display_kr, +}; + +static struct ptlrpc_sec_sops gss_sec_keyring_sops = { + .accept = gss_svc_accept_kr, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_kr, +}; + +static struct ptlrpc_sec_policy gss_policy_keyring = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.keyring", + .sp_policy = SPTLRPC_POLICY_GSS, + .sp_cops = &gss_sec_keyring_cops, + .sp_sops = &gss_sec_keyring_sops, +}; + + +int __init gss_init_keyring(void) +{ + int rc; + + rc = register_key_type(&gss_key_type); + if (rc) { + CERROR("failed to register keyring type: %d\n", rc); + return rc; + } + + rc = sptlrpc_register_policy(&gss_policy_keyring); + if (rc) { + unregister_key_type(&gss_key_type); + return rc; + } + + return 0; +} + +void __exit gss_exit_keyring(void) +{ + unregister_key_type(&gss_key_type); + sptlrpc_unregister_policy(&gss_policy_keyring); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h new file mode 100644 index 000000000000..676d4b96311a --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5.h @@ -0,0 +1,163 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Author: Eric Mei + */ + +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#ifndef PTLRPC_GSS_KRB5_H +#define PTLRPC_GSS_KRB5_H + +/* + * RFC 4142 + */ + +#define KG_USAGE_ACCEPTOR_SEAL 22 +#define KG_USAGE_ACCEPTOR_SIGN 23 +#define KG_USAGE_INITIATOR_SEAL 24 +#define KG_USAGE_INITIATOR_SIGN 25 + +#define KG_TOK_MIC_MSG 0x0404 +#define KG_TOK_WRAP_MSG 0x0504 + +#define FLAG_SENDER_IS_ACCEPTOR 0x01 +#define FLAG_WRAP_CONFIDENTIAL 0x02 +#define FLAG_ACCEPTOR_SUBKEY 0x04 + +struct krb5_header { + __u16 kh_tok_id; /* token id */ + __u8 kh_flags; /* acceptor flags */ + __u8 kh_filler; /* 0xff */ + __u16 kh_ec; /* extra count */ + __u16 kh_rrc; /* right rotation count */ + __u64 kh_seq; /* sequence number */ + __u8 kh_cksum[0]; /* checksum */ +}; + +struct krb5_keyblock { + rawobj_t kb_key; + struct ll_crypto_cipher *kb_tfm; +}; + +struct krb5_ctx { + unsigned int kc_initiate:1, + kc_cfx:1, + kc_seed_init:1, + kc_have_acceptor_subkey:1; + __s32 kc_endtime; + __u8 kc_seed[16]; + __u64 kc_seq_send; + __u64 kc_seq_recv; + __u32 kc_enctype; + struct krb5_keyblock kc_keye; /* encryption */ + struct krb5_keyblock kc_keyi; /* integrity */ + struct krb5_keyblock kc_keyc; /* checksum */ + rawobj_t kc_mech_used; +}; + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; + +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010, /* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +/* des-mac-k */ +/* rsa-md4-des-k */ +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c +#define CKSUMTYPE_HMAC_SHA1_96_AES128 0x000f +#define CKSUMTYPE_HMAC_SHA1_96_AES256 0x0010 +#define CKSUMTYPE_HMAC_MD5_ARCFOUR -138 + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT (39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_AES128_CTS_HMAC_SHA1_96 0x0011 +#define ENCTYPE_AES256_CTS_HMAC_SHA1_96 0x0012 +#define ENCTYPE_ARCFOUR_HMAC 0x0017 +#define ENCTYPE_ARCFOUR_HMAC_EXP 0x0018 +#define ENCTYPE_UNKNOWN 0x01ff + +#endif /* PTLRPC_GSS_KRB5_H */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c new file mode 100644 index 000000000000..4b28931bbc96 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -0,0 +1,1786 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * linux/net/sunrpc/gss_krb5_crypto.c + * linux/net/sunrpc/gss_krb5_seal.c + * linux/net/sunrpc/gss_krb5_seqnum.c + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_asn1.h" +#include "gss_krb5.h" + +static spinlock_t krb5_seq_lock; + +struct krb5_enctype { + char *ke_dispname; + char *ke_enc_name; /* linux tfm name */ + char *ke_hash_name; /* linux tfm name */ + int ke_enc_mode; /* linux tfm mode */ + int ke_hash_size; /* checksum size */ + int ke_conf_size; /* confounder size */ + unsigned int ke_hash_hmac:1; /* is hmac? */ +}; + +/* + * NOTE: for aes128-cts and aes256-cts, MIT implementation use CTS encryption. + * but currently we simply CBC with padding, because linux doesn't support CTS + * yet. this need to be fixed in the future. + */ +static struct krb5_enctype enctypes[] = { + [ENCTYPE_DES_CBC_RAW] = { /* des-cbc-md5 */ + "des-cbc-md5", + "cbc(des)", + "md5", + 0, + 16, + 8, + 0, + }, + [ENCTYPE_DES3_CBC_RAW] = { /* des3-hmac-sha1 */ + "des3-hmac-sha1", + "cbc(des3_ede)", + "hmac(sha1)", + 0, + 20, + 8, + 1, + }, + [ENCTYPE_AES128_CTS_HMAC_SHA1_96] = { /* aes128-cts */ + "aes128-cts-hmac-sha1-96", + "cbc(aes)", + "hmac(sha1)", + 0, + 12, + 16, + 1, + }, + [ENCTYPE_AES256_CTS_HMAC_SHA1_96] = { /* aes256-cts */ + "aes256-cts-hmac-sha1-96", + "cbc(aes)", + "hmac(sha1)", + 0, + 12, + 16, + 1, + }, + [ENCTYPE_ARCFOUR_HMAC] = { /* arcfour-hmac-md5 */ + "arcfour-hmac-md5", + "ecb(arc4)", + "hmac(md5)", + 0, + 16, + 8, + 1, + }, +}; + +#define MAX_ENCTYPES sizeof(enctypes)/sizeof(struct krb5_enctype) + +static const char * enctype2str(__u32 enctype) +{ + if (enctype < MAX_ENCTYPES && enctypes[enctype].ke_dispname) + return enctypes[enctype].ke_dispname; + + return "unknown"; +} + +static +int keyblock_init(struct krb5_keyblock *kb, char *alg_name, int alg_mode) +{ + kb->kb_tfm = ll_crypto_alloc_blkcipher(alg_name, alg_mode, 0); + if (IS_ERR(kb->kb_tfm)) { + CERROR("failed to alloc tfm: %s, mode %d\n", + alg_name, alg_mode); + return -1; + } + + if (ll_crypto_blkcipher_setkey(kb->kb_tfm, kb->kb_key.data, kb->kb_key.len)) { + CERROR("failed to set %s key, len %d\n", + alg_name, kb->kb_key.len); + return -1; + } + + return 0; +} + +static +int krb5_init_keys(struct krb5_ctx *kctx) +{ + struct krb5_enctype *ke; + + if (kctx->kc_enctype >= MAX_ENCTYPES || + enctypes[kctx->kc_enctype].ke_hash_size == 0) { + CERROR("unsupported enctype %x\n", kctx->kc_enctype); + return -1; + } + + ke = &enctypes[kctx->kc_enctype]; + + /* tfm arc4 is stateful, user should alloc-use-free by his own */ + if (kctx->kc_enctype != ENCTYPE_ARCFOUR_HMAC && + keyblock_init(&kctx->kc_keye, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + /* tfm hmac is stateful, user should alloc-use-free by his own */ + if (ke->ke_hash_hmac == 0 && + keyblock_init(&kctx->kc_keyi, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + if (ke->ke_hash_hmac == 0 && + keyblock_init(&kctx->kc_keyc, ke->ke_enc_name, ke->ke_enc_mode)) + return -1; + + return 0; +} + +static +void keyblock_free(struct krb5_keyblock *kb) +{ + rawobj_free(&kb->kb_key); + if (kb->kb_tfm) + ll_crypto_free_blkcipher(kb->kb_tfm); +} + +static +int keyblock_dup(struct krb5_keyblock *new, struct krb5_keyblock *kb) +{ + return rawobj_dup(&new->kb_key, &kb->kb_key); +} + +static +int get_bytes(char **ptr, const char *end, void *res, int len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -1; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +static +int get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + __u32 len; + + p = *ptr; + if (get_bytes(&p, end, &len, sizeof(len))) + return -1; + + q = p + len; + if (q > end || q < p) + return -1; + + OBD_ALLOC_LARGE(res->data, len); + if (!res->data) + return -1; + + res->len = len; + memcpy(res->data, p, len); + *ptr = q; + return 0; +} + +static +int get_keyblock(char **ptr, const char *end, + struct krb5_keyblock *kb, __u32 keysize) +{ + char *buf; + + OBD_ALLOC_LARGE(buf, keysize); + if (buf == NULL) + return -1; + + if (get_bytes(ptr, end, buf, keysize)) { + OBD_FREE_LARGE(buf, keysize); + return -1; + } + + kb->kb_key.len = keysize; + kb->kb_key.data = buf; + return 0; +} + +static +void delete_context_kerberos(struct krb5_ctx *kctx) +{ + rawobj_free(&kctx->kc_mech_used); + + keyblock_free(&kctx->kc_keye); + keyblock_free(&kctx->kc_keyi); + keyblock_free(&kctx->kc_keyc); +} + +static +__u32 import_context_rfc1964(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* seed_init flag */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seed_init = (tmp_uint != 0); + + /* seed */ + if (get_bytes(&p, end, kctx->kc_seed, sizeof(kctx->kc_seed))) + goto out_err; + + /* sign/seal algorithm, not really used now */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + /* end time */ + if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + goto out_err; + + /* seq send */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + kctx->kc_seq_send = tmp_uint; + + /* mech oid */ + if (get_rawobj(&p, end, &kctx->kc_mech_used)) + goto out_err; + + /* old style enc/seq keys in format: + * - enctype (u32) + * - keysize (u32) + * - keydata + * we decompose them to fit into the new context + */ + + /* enc key */ + if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + if (get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + if (get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + + /* seq key */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != kctx->kc_enctype) + goto out_err; + + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint)) || + tmp_uint != keysize) + goto out_err; + + if (get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + /* old style fallback */ + if (keyblock_dup(&kctx->kc_keyi, &kctx->kc_keyc)) + goto out_err; + + if (p != end) + goto out_err; + + CDEBUG(D_SEC, "succesfully imported rfc1964 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* Flags for version 2 context flags */ +#define KRB5_CTX_FLAG_INITIATOR 0x00000001 +#define KRB5_CTX_FLAG_CFX 0x00000002 +#define KRB5_CTX_FLAG_ACCEPTOR_SUBKEY 0x00000004 + +static +__u32 import_context_rfc4121(struct krb5_ctx *kctx, char *p, char *end) +{ + unsigned int tmp_uint, keysize; + + /* end time */ + if (get_bytes(&p, end, &kctx->kc_endtime, sizeof(kctx->kc_endtime))) + goto out_err; + + /* flags */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint & KRB5_CTX_FLAG_INITIATOR) + kctx->kc_initiate = 1; + if (tmp_uint & KRB5_CTX_FLAG_CFX) + kctx->kc_cfx = 1; + if (tmp_uint & KRB5_CTX_FLAG_ACCEPTOR_SUBKEY) + kctx->kc_have_acceptor_subkey = 1; + + /* seq send */ + if (get_bytes(&p, end, &kctx->kc_seq_send, sizeof(kctx->kc_seq_send))) + goto out_err; + + /* enctype */ + if (get_bytes(&p, end, &kctx->kc_enctype, sizeof(kctx->kc_enctype))) + goto out_err; + + /* size of each key */ + if (get_bytes(&p, end, &keysize, sizeof(keysize))) + goto out_err; + + /* number of keys - should always be 3 */ + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) + goto out_err; + + if (tmp_uint != 3) { + CERROR("Invalid number of keys: %u\n", tmp_uint); + goto out_err; + } + + /* ke */ + if (get_keyblock(&p, end, &kctx->kc_keye, keysize)) + goto out_err; + /* ki */ + if (get_keyblock(&p, end, &kctx->kc_keyi, keysize)) + goto out_err; + /* ki */ + if (get_keyblock(&p, end, &kctx->kc_keyc, keysize)) + goto out_err; + + CDEBUG(D_SEC, "succesfully imported v2 context\n"); + return 0; +out_err: + return GSS_S_FAILURE; +} + +/* + * The whole purpose here is trying to keep user level gss context parsing + * from nfs-utils unchanged as possible as we can, they are not quite mature + * yet, and many stuff still not clear, like heimdal etc. + */ +static +__u32 gss_import_sec_context_kerberos(rawobj_t *inbuf, + struct gss_ctx *gctx) +{ + struct krb5_ctx *kctx; + char *p = (char *) inbuf->data; + char *end = (char *) (inbuf->data + inbuf->len); + unsigned int tmp_uint, rc; + + if (get_bytes(&p, end, &tmp_uint, sizeof(tmp_uint))) { + CERROR("Fail to read version\n"); + return GSS_S_FAILURE; + } + + /* only support 0, 1 for the moment */ + if (tmp_uint > 2) { + CERROR("Invalid version %u\n", tmp_uint); + return GSS_S_FAILURE; + } + + OBD_ALLOC_PTR(kctx); + if (!kctx) + return GSS_S_FAILURE; + + if (tmp_uint == 0 || tmp_uint == 1) { + kctx->kc_initiate = tmp_uint; + rc = import_context_rfc1964(kctx, p, end); + } else { + rc = import_context_rfc4121(kctx, p, end); + } + + if (rc == 0) + rc = krb5_init_keys(kctx); + + if (rc) { + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); + + return GSS_S_FAILURE; + } + + gctx->internal_ctx_id = kctx; + return GSS_S_COMPLETE; +} + +static +__u32 gss_copy_reverse_context_kerberos(struct gss_ctx *gctx, + struct gss_ctx *gctx_new) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_ctx *knew; + + OBD_ALLOC_PTR(knew); + if (!knew) + return GSS_S_FAILURE; + + knew->kc_initiate = kctx->kc_initiate ? 0 : 1; + knew->kc_cfx = kctx->kc_cfx; + knew->kc_seed_init = kctx->kc_seed_init; + knew->kc_have_acceptor_subkey = kctx->kc_have_acceptor_subkey; + knew->kc_endtime = kctx->kc_endtime; + + memcpy(knew->kc_seed, kctx->kc_seed, sizeof(kctx->kc_seed)); + knew->kc_seq_send = kctx->kc_seq_recv; + knew->kc_seq_recv = kctx->kc_seq_send; + knew->kc_enctype = kctx->kc_enctype; + + if (rawobj_dup(&knew->kc_mech_used, &kctx->kc_mech_used)) + goto out_err; + + if (keyblock_dup(&knew->kc_keye, &kctx->kc_keye)) + goto out_err; + if (keyblock_dup(&knew->kc_keyi, &kctx->kc_keyi)) + goto out_err; + if (keyblock_dup(&knew->kc_keyc, &kctx->kc_keyc)) + goto out_err; + if (krb5_init_keys(knew)) + goto out_err; + + gctx_new->internal_ctx_id = knew; + CDEBUG(D_SEC, "succesfully copied reverse context\n"); + return GSS_S_COMPLETE; + +out_err: + delete_context_kerberos(knew); + OBD_FREE_PTR(knew); + return GSS_S_FAILURE; +} + +static +__u32 gss_inquire_context_kerberos(struct gss_ctx *gctx, + unsigned long *endtime) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + + *endtime = (unsigned long) ((__u32) kctx->kc_endtime); + return GSS_S_COMPLETE; +} + +static +void gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *kctx = internal_ctx; + + delete_context_kerberos(kctx); + OBD_FREE_PTR(kctx); +} + +static +void buf_to_sg(struct scatterlist *sg, void *ptr, int len) +{ + sg_set_buf(sg, ptr, len); +} + +static +__u32 krb5_encrypt(struct ll_crypto_cipher *tfm, + int decrypt, + void * iv, + void * in, + void * out, + int length) +{ + struct blkcipher_desc desc; + struct scatterlist sg; + __u8 local_iv[16] = {0}; + __u32 ret = -EINVAL; + + LASSERT(tfm); + desc.tfm = tfm; + desc.info = local_iv; + desc.flags= 0; + + if (length % ll_crypto_blkcipher_blocksize(tfm) != 0) { + CERROR("output length %d mismatch blocksize %d\n", + length, ll_crypto_blkcipher_blocksize(tfm)); + goto out; + } + + if (ll_crypto_blkcipher_ivsize(tfm) > 16) { + CERROR("iv size too large %d\n", ll_crypto_blkcipher_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, ll_crypto_blkcipher_ivsize(tfm)); + + memcpy(out, in, length); + buf_to_sg(&sg, out, length); + + if (decrypt) + ret = ll_crypto_blkcipher_decrypt_iv(&desc, &sg, &sg, length); + else + ret = ll_crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, length); + +out: + return(ret); +} + + +static inline +int krb5_digest_hmac(struct ll_crypto_hash *tfm, + rawobj_t *key, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int i; + + ll_crypto_hash_setkey(tfm, key->data, key->len); + desc.tfm = tfm; + desc.flags= 0; + + ll_crypto_hash_init(&desc); + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len); + ll_crypto_hash_update(&desc, sg, msgs[i].len); + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len); + } + + if (khdr) { + buf_to_sg(sg, (char *) khdr, sizeof(*khdr)); + ll_crypto_hash_update(&desc, sg, sizeof(*khdr)); + } + + return ll_crypto_hash_final(&desc, cksum->data); +} + + +static inline +int krb5_digest_norm(struct ll_crypto_hash *tfm, + struct krb5_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct hash_desc desc; + struct scatterlist sg[1]; + int i; + + LASSERT(kb->kb_tfm); + desc.tfm = tfm; + desc.flags= 0; + + ll_crypto_hash_init(&desc); + + for (i = 0; i < msgcnt; i++) { + if (msgs[i].len == 0) + continue; + buf_to_sg(sg, (char *) msgs[i].data, msgs[i].len); + ll_crypto_hash_update(&desc, sg, msgs[i].len); + } + + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + + sg_set_page(&sg[0], iovs[i].kiov_page, iovs[i].kiov_len, + iovs[i].kiov_offset); + ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len); + } + + if (khdr) { + buf_to_sg(sg, (char *) khdr, sizeof(*khdr)); + ll_crypto_hash_update(&desc, sg, sizeof(*khdr)); + } + + ll_crypto_hash_final(&desc, cksum->data); + + return krb5_encrypt(kb->kb_tfm, 0, NULL, cksum->data, + cksum->data, cksum->len); +} + +/* + * compute (keyed/keyless) checksum against the plain text which appended + * with krb5 wire token header. + */ +static +__s32 krb5_make_checksum(__u32 enctype, + struct krb5_keyblock *kb, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) +{ + struct krb5_enctype *ke = &enctypes[enctype]; + struct ll_crypto_hash *tfm; + __u32 code = GSS_S_FAILURE; + int rc; + + if (!(tfm = ll_crypto_alloc_hash(ke->ke_hash_name, 0, 0))) { + CERROR("failed to alloc TFM: %s\n", ke->ke_hash_name); + return GSS_S_FAILURE; + } + + cksum->len = ll_crypto_hash_digestsize(tfm); + OBD_ALLOC_LARGE(cksum->data, cksum->len); + if (!cksum->data) { + cksum->len = 0; + goto out_tfm; + } + + if (ke->ke_hash_hmac) + rc = krb5_digest_hmac(tfm, &kb->kb_key, + khdr, msgcnt, msgs, iovcnt, iovs, cksum); + else + rc = krb5_digest_norm(tfm, kb, + khdr, msgcnt, msgs, iovcnt, iovs, cksum); + + if (rc == 0) + code = GSS_S_COMPLETE; +out_tfm: + ll_crypto_free_hash(tfm); + return code; +} + +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + +static +__u32 gss_get_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *) token->data; + fill_krb5_header(kctx, khdr, 0); + + /* checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) + return GSS_S_FAILURE; + + LASSERT(cksum.len >= ke->ke_hash_size); + LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); + memcpy(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + + token->len = sizeof(*khdr) + ke->ke_hash_size; + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +static +__u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, + int msgcnt, + rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + rawobj_t cksum = RAWOBJ_EMPTY; + __u32 major; + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *) token->data; + + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + if (token->len < sizeof(*khdr) + ke->ke_hash_size) { + CERROR("short signature: %u, require %d\n", + token->len, (int) sizeof(*khdr) + ke->ke_hash_size); + return GSS_S_FAILURE; + } + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { + CERROR("failed to make checksum\n"); + return GSS_S_FAILURE; + } + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } + + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +static +int add_padding(rawobj_t *msg, int msg_buflen, int blocksize) +{ + int padding; + + padding = (blocksize - (msg->len & (blocksize - 1))) & + (blocksize - 1); + if (!padding) + return 0; + + if (msg->len + padding > msg_buflen) { + CERROR("bufsize %u too small: datalen %u, padding %u\n", + msg_buflen, msg->len, padding); + return -EINVAL; + } + + memset(msg->data + msg->len, padding, padding); + msg->len += padding; + return 0; +} + +static +int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm, + int mode_ecb, + int inobj_cnt, + rawobj_t *inobjs, + rawobj_t *outobj, + int enc) +{ + struct blkcipher_desc desc; + struct scatterlist src, dst; + __u8 local_iv[16] = {0}, *buf; + __u32 datalen = 0; + int i, rc; + ENTRY; + + buf = outobj->data; + desc.tfm = tfm; + desc.info = local_iv; + desc.flags = 0; + + for (i = 0; i < inobj_cnt; i++) { + LASSERT(buf + inobjs[i].len <= outobj->data + outobj->len); + + buf_to_sg(&src, inobjs[i].data, inobjs[i].len); + buf_to_sg(&dst, buf, outobj->len - datalen); + + if (mode_ecb) { + if (enc) + rc = ll_crypto_blkcipher_encrypt( + &desc, &dst, &src, src.length); + else + rc = ll_crypto_blkcipher_decrypt( + &desc, &dst, &src, src.length); + } else { + if (enc) + rc = ll_crypto_blkcipher_encrypt_iv( + &desc, &dst, &src, src.length); + else + rc = ll_crypto_blkcipher_decrypt_iv( + &desc, &dst, &src, src.length); + } + + if (rc) { + CERROR("encrypt error %d\n", rc); + RETURN(rc); + } + + datalen += inobjs[i].len; + buf += inobjs[i].len; + } + + outobj->len = datalen; + RETURN(0); +} + +/* + * if adj_nob != 0, we adjust desc->bd_nob to the actual cipher text size. + */ +static +int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + int blocksize, i, rc, nob = 0; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + + blocksize = ll_crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + /* encrypt confounder */ + buf_to_sg(&src, confounder, blocksize); + buf_to_sg(&dst, cipher->data, blocksize); + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize); + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + sg_set_page(&src, desc->bd_iov[i].kiov_page, + (desc->bd_iov[i].kiov_len + blocksize - 1) & + (~(blocksize - 1)), + desc->bd_iov[i].kiov_offset); + if (adj_nob) + nob += src.length; + sg_set_page(&dst, desc->bd_enc_iov[i].kiov_page, src.length, + src.offset); + + desc->bd_enc_iov[i].kiov_offset = dst.offset; + desc->bd_enc_iov[i].kiov_len = dst.length; + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + return rc; + } + } + + /* encrypt krb5 header */ + buf_to_sg(&src, khdr, sizeof(*khdr)); + buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr)); + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, + &dst, &src, sizeof(*khdr)); + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. + * + * if adj_nob != 0, we adjust each page's kiov_len to the actual + * plain text size. + * - for client read: we don't know data size for each page, so + * bd_iov[]->kiov_len is set to PAGE_SIZE, but actual data received might + * be smaller, so we need to adjust it according to bd_enc_iov[]->kiov_len. + * this means we DO NOT support the situation that server send an odd size + * data in a page which is not the last one. + * - for server write: we knows exactly data size for each page being expected, + * thus kiov_len is accurate already, so we should not adjust it at all. + * and bd_enc_iov[]->kiov_len should be round_up(bd_iov[]->kiov_len) which + * should have been done by prep_bulk(). + */ +static +int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain, + int adj_nob) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + LASSERT(desc->bd_nob_transferred); + + blocksize = ll_crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + buf_to_sg(&src, cipher->data, blocksize); + buf_to_sg(&dst, plain->data, blocksize); + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize); + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + return rc; + } + + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (desc->bd_enc_iov[i].kiov_offset % blocksize != 0 || + desc->bd_enc_iov[i].kiov_len % blocksize != 0) { + CERROR("page %d: odd offset %u len %u, blocksize %d\n", + i, desc->bd_enc_iov[i].kiov_offset, + desc->bd_enc_iov[i].kiov_len, blocksize); + return -EFAULT; + } + + if (adj_nob) { + if (ct_nob + desc->bd_enc_iov[i].kiov_len > + desc->bd_nob_transferred) + desc->bd_enc_iov[i].kiov_len = + desc->bd_nob_transferred - ct_nob; + + desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len; + if (pt_nob + desc->bd_enc_iov[i].kiov_len >desc->bd_nob) + desc->bd_iov[i].kiov_len = desc->bd_nob -pt_nob; + } else { + /* this should be guaranteed by LNET */ + LASSERT(ct_nob + desc->bd_enc_iov[i].kiov_len <= + desc->bd_nob_transferred); + LASSERT(desc->bd_iov[i].kiov_len <= + desc->bd_enc_iov[i].kiov_len); + } + + if (desc->bd_enc_iov[i].kiov_len == 0) + continue; + + sg_set_page(&src, desc->bd_enc_iov[i].kiov_page, + desc->bd_enc_iov[i].kiov_len, + desc->bd_enc_iov[i].kiov_offset); + dst = src; + if (desc->bd_iov[i].kiov_len % blocksize == 0) + sg_assign_page(&dst, desc->bd_iov[i].kiov_page); + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + return rc; + } + + if (desc->bd_iov[i].kiov_len % blocksize != 0) { + memcpy(page_address(desc->bd_iov[i].kiov_page) + + desc->bd_iov[i].kiov_offset, + page_address(desc->bd_enc_iov[i].kiov_page) + + desc->bd_iov[i].kiov_offset, + desc->bd_iov[i].kiov_len); + } + + ct_nob += desc->bd_enc_iov[i].kiov_len; + pt_nob += desc->bd_iov[i].kiov_len; + } + + if (unlikely(ct_nob != desc->bd_nob_transferred)) { + CERROR("%d cipher text transferred but only %d decrypted\n", + desc->bd_nob_transferred, ct_nob); + return -EFAULT; + } + + if (unlikely(!adj_nob && pt_nob != desc->bd_nob)) { + CERROR("%d plain text expected but only %d received\n", + desc->bd_nob, pt_nob); + return -EFAULT; + } + + /* if needed, clear up the rest unused iovs */ + if (adj_nob) + while (i < desc->bd_iov_count) + desc->bd_iov[i++].kiov_len = 0; + + /* decrypt tail (krb5 header) */ + buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr)); + buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr)); + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, + &dst, &src, sizeof(*khdr)); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static +__u32 gss_wrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *token) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[3], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + LASSERT(kctx->kc_keye.kb_tfm == NULL || + ke->ke_conf_size >= + ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm)); + + /* + * final token format: + * --------------------------------------------------- + * | krb5 header | cipher text | checksum (16 bytes) | + * --------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *) token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(blocksize <= ke->ke_conf_size); + + /* padding the message */ + if (add_padding(msg, msg_buflen, blocksize)) + return GSS_S_FAILURE; + + /* + * clear text layout for checksum: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = gsshdr->data; + data_desc[1].len = gsshdr->len; + data_desc[2].data = msg->data; + data_desc[2].len = msg->len; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, data_desc, 0, NULL, &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + data_desc[1].data = msg->data; + data_desc[1].len = msg->len; + data_desc[2].data = (__u8 *) khdr; + data_desc[2].len = sizeof(*khdr); + + /* cipher text will be directly inplace */ + cipher.data = (__u8 *) (khdr + 1); + cipher.len = token->len - sizeof(*khdr); + LASSERT(cipher.len >= ke->ke_conf_size + msg->len + sizeof(*khdr)); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct ll_crypto_cipher *arc4_tfm; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, + arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = krb5_encrypt_rawobjs(arc4_tfm, 1, + 3, data_desc, &cipher, 1); +arc4_out_tfm: + ll_crypto_free_blkcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + do {} while(0); /* just to avoid compile warning */ + } else { + rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, + 3, data_desc, &cipher, 1); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page); + /* + * offset should always start at page boundary of either + * client or server side. + */ + if (desc->bd_iov[i].kiov_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + desc->bd_iov[i].kiov_offset, i); + return GSS_S_FAILURE; + } + + desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset; + desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *) token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + cfs_get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. + * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksize <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_iov, + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; +} + +static +__u32 gss_unwrap_kerberos(struct gss_ctx *gctx, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *msg) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + unsigned char *tmpbuf; + int blocksize, bodysize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher_in, plain_out; + rawobj_t hash_objs[3]; + int rc = 0; + __u32 major; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *) token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* expected token layout: + * ---------------------------------------- + * | krb5 header | cipher text | checksum | + * ---------------------------------------- + */ + bodysize = token->len - sizeof(*khdr) - ke->ke_hash_size; + + if (bodysize % blocksize) { + CERROR("odd bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (bodysize <= ke->ke_conf_size + sizeof(*khdr)) { + CERROR("incomplete token: bodysize %d\n", bodysize); + return GSS_S_DEFECTIVE_TOKEN; + } + + if (msg->len < bodysize - ke->ke_conf_size - sizeof(*khdr)) { + CERROR("buffer too small: %u, require %d\n", + msg->len, bodysize - ke->ke_conf_size); + return GSS_S_FAILURE; + } + + /* decrypting */ + OBD_ALLOC_LARGE(tmpbuf, bodysize); + if (!tmpbuf) + return GSS_S_FAILURE; + + major = GSS_S_FAILURE; + + cipher_in.data = (__u8 *) (khdr + 1); + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + rawobj_t arc4_keye; + struct ll_crypto_cipher *arc4_tfm; + + cksum.data = token->data + token->len - ke->ke_hash_size; + cksum.len = ke->ke_hash_size; + + if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { + CERROR("failed to obtain arc4 enc key\n"); + GOTO(arc4_out, rc = -EACCES); + } + + arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0); + if (IS_ERR(arc4_tfm)) { + CERROR("failed to alloc tfm arc4 in ECB mode\n"); + GOTO(arc4_out_key, rc = -EACCES); + } + + if (ll_crypto_blkcipher_setkey(arc4_tfm, + arc4_keye.data, arc4_keye.len)) { + CERROR("failed to set arc4 key, len %d\n", + arc4_keye.len); + GOTO(arc4_out_tfm, rc = -EACCES); + } + + rc = krb5_encrypt_rawobjs(arc4_tfm, 1, + 1, &cipher_in, &plain_out, 0); +arc4_out_tfm: + ll_crypto_free_blkcipher(arc4_tfm); +arc4_out_key: + rawobj_free(&arc4_keye); +arc4_out: + cksum = RAWOBJ_EMPTY; + } else { + rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, + 1, &cipher_in, &plain_out, 0); + } + + if (rc != 0) { + CERROR("error decrypt\n"); + goto out_free; + } + LASSERT(plain_out.len == bodysize); + + /* expected clear text layout: + * ----------------------------------------- + * | confounder | clear msgs | krb5 header | + * ----------------------------------------- + */ + + /* verify krb5 header in token is not modified */ + if (memcmp(khdr, plain_out.data + plain_out.len - sizeof(*khdr), + sizeof(*khdr))) { + CERROR("decrypted krb5 header mismatch\n"); + goto out_free; + } + + /* verify checksum, compose clear text as layout: + * ------------------------------------------------------ + * | confounder | gss header | clear msgs | krb5 header | + * ------------------------------------------------------ + */ + hash_objs[0].len = ke->ke_conf_size; + hash_objs[0].data = plain_out.data; + hash_objs[1].len = gsshdr->len; + hash_objs[1].data = gsshdr->data; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); + hash_objs[2].data = plain_out.data + ke->ke_conf_size; + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 3, hash_objs, 0, NULL, &cksum)) + goto out_free; + + LASSERT(cksum.len >= ke->ke_hash_size); + if (memcmp((char *)(khdr + 1) + bodysize, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + goto out_free; + } + + msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); + memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(tmpbuf, bodysize); + rawobj_free(&cksum); + return major; +} + +static +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *) token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; + + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain, adj_nob); + if (rc) + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksize; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_iov, + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + if (memcmp(plain.data + blocksize + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } + + rawobj_free(&cksum); + return GSS_S_COMPLETE; +} + +int gss_display_kerberos(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int written; + + written = snprintf(buf, bufsize, "krb5 (%s)", + enctype2str(kctx->kc_enctype)); + return written; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_copy_reverse_context = gss_copy_reverse_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + .gss_display = gss_display_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5N, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_NULL, + .sf_name = "krb5n" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5A, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_AUTH, + .sf_name = "krb5a" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5I, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_INTG, + .sf_name = "krb5i" + }, + { + .sf_subflavor = SPTLRPC_SUBFLVR_KRB5P, + .sf_qop = 0, + .sf_service = SPTLRPC_SVC_PRIV, + .sf_name = "krb5p" + }, +}; + +/* + * currently we leave module owner NULL + */ +static struct gss_api_mech gss_kerberos_mech = { + .gm_owner = NULL, /*THIS_MODULE, */ + .gm_name = "krb5", + .gm_oid = (rawobj_t) + {9, "\052\206\110\206\367\022\001\002\002"}, + .gm_ops = &gss_kerberos_ops, + .gm_sf_num = 4, + .gm_sfs = gss_kerberos_sfs, +}; + +int __init init_kerberos_module(void) +{ + int status; + + spin_lock_init(&krb5_seq_lock); + + status = lgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +void __exit cleanup_kerberos_module(void) +{ + lgss_mech_unregister(&gss_kerberos_mech); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c new file mode 100644 index 000000000000..8cdad800382d --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_mech_switch.c @@ -0,0 +1,359 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +int lgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CWARN("Register %s mechanism\n", gm->gm_name); + return 0; +} + +void lgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CWARN("Unregister %s mechanism\n", gm->gm_name); +} + + +struct gss_api_mech *lgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +struct gss_api_mech *lgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +static inline +int mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].sf_subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech *lgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +void lgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +__u32 lgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC_PTR(*ctx_id); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = lgss_mech_get(mech); + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +__u32 lgss_copy_reverse_context(struct gss_ctx *ctx_id, + struct gss_ctx **ctx_id_new) +{ + struct gss_api_mech *mech = ctx_id->mech_type; + __u32 major; + + LASSERT(mech); + + OBD_ALLOC_PTR(*ctx_id_new); + if (*ctx_id_new == NULL) + return GSS_S_FAILURE; + + (*ctx_id_new)->mech_type = lgss_mech_get(mech); + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_copy_reverse_context); + + major = mech->gm_ops->gss_copy_reverse_context(ctx_id, *ctx_id_new); + if (major != GSS_S_COMPLETE) { + lgss_mech_put(mech); + OBD_FREE_PTR(*ctx_id_new); + *ctx_id_new = NULL; + } + return major; +} + +/* + * this interface is much simplified, currently we only need endtime. + */ +__u32 lgss_inquire_context(struct gss_ctx *context_handle, + unsigned long *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 lgss_get_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ +__u32 lgss_verify_mic(struct gss_ctx *context_handle, + int msgcnt, + rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + msgcnt, + msg, + iovcnt, + iovs, + mic_token); +} + +__u32 lgss_wrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *msg, + int msg_buflen, + rawobj_t *out_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, gsshdr, msg, msg_buflen, out_token); +} + +__u32 lgss_unwrap(struct gss_ctx *context_handle, + rawobj_t *gsshdr, + rawobj_t *token, + rawobj_t *out_msg) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, gsshdr, token, out_msg); +} + + +__u32 lgss_prep_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); + + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token, adj_nob); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +__u32 lgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + if (!*context_handle) + return(GSS_S_NO_CONTEXT); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != 0) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + lgss_mech_put(mech); + + OBD_FREE_PTR(*context_handle); + *context_handle=NULL; + return GSS_S_COMPLETE; +} + +int lgss_display(struct gss_ctx *ctx, + char *buf, + int bufsize) +{ + LASSERT(ctx); + LASSERT(ctx->mech_type); + LASSERT(ctx->mech_type->gm_ops); + LASSERT(ctx->mech_type->gm_ops->gss_display); + + return ctx->mech_type->gm_ops->gss_display(ctx, buf, bufsize); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c new file mode 100644 index 000000000000..3df7257b7fa0 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_pipefs.c @@ -0,0 +1,1252 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include +#include +struct rpc_clnt; /* for rpc_pipefs */ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct ptlrpc_sec_policy gss_policy_pipefs; +static struct ptlrpc_ctx_ops gss_pipefs_ctxops; + +static int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx); + +static int gss_sec_pipe_upcall_init(struct gss_sec *gsec) +{ + return 0; +} + +static void gss_sec_pipe_upcall_fini(struct gss_sec *gsec) +{ +} + +/**************************************** + * internel context helpers * + ****************************************/ + +static +struct ptlrpc_cli_ctx *ctx_create_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx; + int rc; + + OBD_ALLOC_PTR(gctx); + if (gctx == NULL) + return NULL; + + rc = gss_cli_ctx_init_common(sec, &gctx->gc_base, + &gss_pipefs_ctxops, vcred); + if (rc) { + OBD_FREE_PTR(gctx); + return NULL; + } + + return &gctx->gc_base; +} + +static +void ctx_destroy_pf(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + if (gss_cli_ctx_fini_common(sec, ctx)) + return; + + OBD_FREE_PTR(gctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +void ctx_enhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *hash) +{ + set_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + atomic_inc(&ctx->cc_refcount); + hlist_add_head(&ctx->cc_cache, hash); +} + +/* + * caller must hold spinlock + */ +static +void ctx_unhash_pf(struct ptlrpc_cli_ctx *ctx, struct hlist_head *freelist) +{ + LASSERT(spin_is_locked(&ctx->cc_sec->ps_lock)); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + + clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags); + + if (atomic_dec_and_test(&ctx->cc_refcount)) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, freelist); + } else { + hlist_del_init(&ctx->cc_cache); + } +} + +/* + * return 1 if the context is dead. + */ +static +int ctx_check_death_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + if (cli_ctx_check_death(ctx)) { + if (freelist) + ctx_unhash_pf(ctx, freelist); + return 1; + } + + return 0; +} + +static inline +int ctx_check_death_locked_pf(struct ptlrpc_cli_ctx *ctx, + struct hlist_head *freelist) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)); + + return ctx_check_death_pf(ctx, freelist); +} + +static inline +int ctx_match_pf(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + /* a little bit optimization for null policy */ + if (!ctx->cc_ops->match) + return 1; + + return ctx->cc_ops->match(ctx, vcred); +} + +static +void ctx_list_destroy_pf(struct hlist_head *head) +{ + struct ptlrpc_cli_ctx *ctx; + + while (!hlist_empty(head)) { + ctx = hlist_entry(head->first, struct ptlrpc_cli_ctx, + cc_cache); + + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, + &ctx->cc_flags) == 0); + + hlist_del_init(&ctx->cc_cache); + ctx_destroy_pf(ctx->cc_sec, ctx); + } +} + +/**************************************** + * context apis * + ****************************************/ + +static +int gss_cli_ctx_validate_pf(struct ptlrpc_cli_ctx *ctx) +{ + if (ctx_check_death_pf(ctx, NULL)) + return 1; + if (cli_ctx_is_ready(ctx)) + return 0; + return 1; +} + +static +void gss_cli_ctx_die_pf(struct ptlrpc_cli_ctx *ctx, int grace) +{ + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + cli_ctx_expire(ctx); + + spin_lock(&ctx->cc_sec->ps_lock); + + if (test_and_clear_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags)) { + LASSERT(!hlist_unhashed(&ctx->cc_cache)); + LASSERT(atomic_read(&ctx->cc_refcount) > 1); + + hlist_del_init(&ctx->cc_cache); + if (atomic_dec_and_test(&ctx->cc_refcount)) + LBUG(); + } + + spin_unlock(&ctx->cc_sec->ps_lock); +} + +/**************************************** + * reverse context installation * + ****************************************/ + +static inline +unsigned int ctx_hash_index(int hashsize, __u64 key) +{ + return (unsigned int) (key & ((__u64) hashsize - 1)); +} + +static +void gss_sec_ctx_replace_pf(struct gss_sec *gsec, + struct ptlrpc_cli_ctx *new) +{ + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node *next; + HLIST_HEAD(freelist); + unsigned int hash; + ENTRY; + + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) new->cc_vcred.vc_uid); + LASSERT(hash < gsec_pf->gsp_chash_size); + + spin_lock(&gsec->gs_base.ps_lock); + + hlist_for_each_entry_safe(ctx, next, + &gsec_pf->gsp_chash[hash], cc_cache) { + if (!ctx_match_pf(ctx, &new->cc_vcred)) + continue; + + cli_ctx_expire(ctx); + ctx_unhash_pf(ctx, &freelist); + break; + } + + ctx_enhash_pf(new, &gsec_pf->gsp_chash[hash]); + + spin_unlock(&gsec->gs_base.ps_lock); + + ctx_list_destroy_pf(&freelist); + EXIT; +} + +static +int gss_install_rvs_cli_ctx_pf(struct gss_sec *gsec, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct vfs_cred vcred; + struct ptlrpc_cli_ctx *cli_ctx; + int rc; + ENTRY; + + vcred.vc_uid = 0; + vcred.vc_gid = 0; + + cli_ctx = ctx_create_pf(&gsec->gs_base, &vcred); + if (!cli_ctx) + RETURN(-ENOMEM); + + rc = gss_copy_rvc_cli_ctx(cli_ctx, svc_ctx); + if (rc) { + ctx_destroy_pf(cli_ctx->cc_sec, cli_ctx); + RETURN(rc); + } + + gss_sec_ctx_replace_pf(gsec, cli_ctx); + RETURN(0); +} + +static +void gss_ctx_cache_gc_pf(struct gss_sec_pipefs *gsec_pf, + struct hlist_head *freelist) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node *next; + int i; + ENTRY; + + sec = &gsec_pf->gsp_base.gs_base; + + CDEBUG(D_SEC, "do gc on sec %s@%p\n", sec->ps_policy->sp_name, sec); + + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + hlist_for_each_entry_safe(ctx, next, + &gsec_pf->gsp_chash[i], cc_cache) + ctx_check_death_locked_pf(ctx, freelist); + } + + sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval; + EXIT; +} + +static +struct ptlrpc_sec* gss_sec_create_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *sf) +{ + struct gss_sec_pipefs *gsec_pf; + int alloc_size, hash_size, i; + ENTRY; + +#define GSS_SEC_PIPEFS_CTX_HASH_SIZE (32) + + if (ctx || + sf->sf_flags & (PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_REVERSE)) + hash_size = 1; + else + hash_size = GSS_SEC_PIPEFS_CTX_HASH_SIZE; + + alloc_size = sizeof(*gsec_pf) + + sizeof(struct hlist_head) * hash_size; + + OBD_ALLOC(gsec_pf, alloc_size); + if (!gsec_pf) + RETURN(NULL); + + gsec_pf->gsp_chash_size = hash_size; + for (i = 0; i < hash_size; i++) + INIT_HLIST_HEAD(&gsec_pf->gsp_chash[i]); + + if (gss_sec_create_common(&gsec_pf->gsp_base, &gss_policy_pipefs, + imp, ctx, sf)) + goto err_free; + + if (ctx == NULL) { + if (gss_sec_pipe_upcall_init(&gsec_pf->gsp_base)) + goto err_destroy; + } else { + if (gss_install_rvs_cli_ctx_pf(&gsec_pf->gsp_base, ctx)) + goto err_destroy; + } + + RETURN(&gsec_pf->gsp_base.gs_base); + +err_destroy: + gss_sec_destroy_common(&gsec_pf->gsp_base); +err_free: + OBD_FREE(gsec_pf, alloc_size); + RETURN(NULL); +} + +static +void gss_sec_destroy_pf(struct ptlrpc_sec *sec) +{ + struct gss_sec_pipefs *gsec_pf; + struct gss_sec *gsec; + + CWARN("destroy %s@%p\n", sec->ps_policy->sp_name, sec); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + LASSERT(gsec_pf->gsp_chash); + LASSERT(gsec_pf->gsp_chash_size); + + gss_sec_pipe_upcall_fini(gsec); + + gss_sec_destroy_common(gsec); + + OBD_FREE(gsec, sizeof(*gsec_pf) + + sizeof(struct hlist_head) * gsec_pf->gsp_chash_size); +} + +static +struct ptlrpc_cli_ctx * gss_sec_lookup_ctx_pf(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx = NULL, *new = NULL; + struct hlist_head *hash_head; + struct hlist_node *next; + HLIST_HEAD(freelist); + unsigned int hash, gc = 0, found = 0; + ENTRY; + + might_sleep(); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + hash = ctx_hash_index(gsec_pf->gsp_chash_size, + (__u64) vcred->vc_uid); + hash_head = &gsec_pf->gsp_chash[hash]; + LASSERT(hash < gsec_pf->gsp_chash_size); + +retry: + spin_lock(&sec->ps_lock); + + /* gc_next == 0 means never do gc */ + if (remove_dead && sec->ps_gc_next && + cfs_time_after(cfs_time_current_sec(), sec->ps_gc_next)) { + gss_ctx_cache_gc_pf(gsec_pf, &freelist); + gc = 1; + } + + hlist_for_each_entry_safe(ctx, next, hash_head, cc_cache) { + if (gc == 0 && + ctx_check_death_locked_pf(ctx, + remove_dead ? &freelist : NULL)) + continue; + + if (ctx_match_pf(ctx, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != ctx) { + /* lost the race, just free it */ + hlist_add_head(&new->cc_cache, &freelist); + new = NULL; + } + + /* hot node, move to head */ + if (hash_head->first != &ctx->cc_cache) { + __hlist_del(&ctx->cc_cache); + hlist_add_head(&ctx->cc_cache, hash_head); + } + } else { + /* don't allocate for reverse sec */ + if (sec_is_reverse(sec)) { + spin_unlock(&sec->ps_lock); + RETURN(NULL); + } + + if (new) { + ctx_enhash_pf(new, hash_head); + ctx = new; + } else if (create) { + spin_unlock(&sec->ps_lock); + new = ctx_create_pf(sec, vcred); + if (new) { + clear_bit(PTLRPC_CTX_NEW_BIT, &new->cc_flags); + goto retry; + } + } else { + ctx = NULL; + } + } + + /* hold a ref */ + if (ctx) + atomic_inc(&ctx->cc_refcount); + + spin_unlock(&sec->ps_lock); + + /* the allocator of the context must give the first push to refresh */ + if (new) { + LASSERT(new == ctx); + gss_cli_ctx_refresh_pf(new); + } + + ctx_list_destroy_pf(&freelist); + RETURN(ctx); +} + +static +void gss_sec_release_ctx_pf(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync) +{ + LASSERT(test_bit(PTLRPC_CTX_CACHED_BIT, &ctx->cc_flags) == 0); + LASSERT(hlist_unhashed(&ctx->cc_cache)); + + /* if required async, we must clear the UPTODATE bit to prevent extra + * rpcs during destroy procedure. */ + if (!sync) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + /* destroy this context */ + ctx_destroy_pf(sec, ctx); +} + +/* + * @uid: which user. "-1" means flush all. + * @grace: mark context DEAD, allow graceful destroy like notify + * server side, etc. + * @force: also flush busy entries. + * + * return the number of busy context encountered. + * + * In any cases, never touch "eternal" contexts. + */ +static +int gss_sec_flush_ctx_cache_pf(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + struct gss_sec *gsec; + struct gss_sec_pipefs *gsec_pf; + struct ptlrpc_cli_ctx *ctx; + struct hlist_node *next; + HLIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + might_sleep_if(grace); + + gsec = container_of(sec, struct gss_sec, gs_base); + gsec_pf = container_of(gsec, struct gss_sec_pipefs, gsp_base); + + spin_lock(&sec->ps_lock); + for (i = 0; i < gsec_pf->gsp_chash_size; i++) { + hlist_for_each_entry_safe(ctx, next, + &gsec_pf->gsp_chash[i], + cc_cache) { + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + + if (uid != -1 && uid != ctx->cc_vcred.vc_uid) + continue; + + if (atomic_read(&ctx->cc_refcount) > 1) { + busy++; + if (!force) + continue; + + CWARN("flush busy(%d) ctx %p(%u->%s) by force, " + "grace %d\n", + atomic_read(&ctx->cc_refcount), + ctx, ctx->cc_vcred.vc_uid, + sec2target_str(ctx->cc_sec), grace); + } + ctx_unhash_pf(ctx, &freelist); + + set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags); + if (!grace) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, + &ctx->cc_flags); + } + } + spin_unlock(&sec->ps_lock); + + ctx_list_destroy_pf(&freelist); + RETURN(busy); +} + +/**************************************** + * service apis * + ****************************************/ + +static +int gss_svc_accept_pf(struct ptlrpc_request *req) +{ + return gss_svc_accept(&gss_policy_pipefs, req); +} + +static +int gss_svc_install_rctx_pf(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec *sec; + int rc; + + sec = sptlrpc_import_sec_ref(imp); + LASSERT(sec); + rc = gss_install_rvs_cli_ctx_pf(sec2gsec(sec), ctx); + + sptlrpc_sec_put(sec); + return rc; +} + +/**************************************** + * rpc_pipefs definitions * + ****************************************/ + +#define LUSTRE_PIPE_ROOT "/lustre" +#define LUSTRE_PIPE_KRB5 LUSTRE_PIPE_ROOT"/krb5" + +struct gss_upcall_msg_data { + __u32 gum_seq; + __u32 gum_uid; + __u32 gum_gid; + __u32 gum_svc; /* MDS/OSS... */ + __u64 gum_nid; /* peer NID */ + __u8 gum_obd[64]; /* client obd name */ +}; + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + __u32 gum_mechidx; + struct gss_sec *gum_gsec; + struct gss_cli_ctx *gum_gctx; + struct gss_upcall_msg_data gum_data; +}; + +static atomic_t upcall_seq = ATOMIC_INIT(0); + +static inline +__u32 upcall_get_sequence(void) +{ + return (__u32) atomic_inc_return(&upcall_seq); +} + +enum mech_idx_t { + MECH_KRB5 = 0, + MECH_MAX +}; + +static inline +__u32 mech_name2idx(const char *name) +{ + LASSERT(!strcmp(name, "krb5")); + return MECH_KRB5; +} + +/* pipefs dentries for each mechanisms */ +static struct dentry *de_pipes[MECH_MAX] = { NULL, }; +/* all upcall messgaes linked here */ +static struct list_head upcall_lists[MECH_MAX]; +/* and protected by this */ +static spinlock_t upcall_locks[MECH_MAX]; + +static inline +void upcall_list_lock(int idx) +{ + spin_lock(&upcall_locks[idx]); +} + +static inline +void upcall_list_unlock(int idx) +{ + spin_unlock(&upcall_locks[idx]); +} + +static +void upcall_msg_enlist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_add(&msg->gum_list, &upcall_lists[idx]); + upcall_list_unlock(idx); +} + +static +void upcall_msg_delist(struct gss_upcall_msg *msg) +{ + __u32 idx = msg->gum_mechidx; + + upcall_list_lock(idx); + list_del_init(&msg->gum_list); + upcall_list_unlock(idx); +} + +/**************************************** + * rpc_pipefs upcall helpers * + ****************************************/ + +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + EXIT; + return; + } + + if (gmsg->gum_gctx) { + sptlrpc_cli_ctx_wakeup(&gmsg->gum_gctx->gc_base); + sptlrpc_cli_ctx_put(&gmsg->gum_gctx->gc_base, 1); + gmsg->gum_gctx = NULL; + } + + LASSERT(list_empty(&gmsg->gum_list)); + LASSERT(list_empty(&gmsg->gum_base.list)); + OBD_FREE_PTR(gmsg); + EXIT; +} + +static +void gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + LASSERT(spin_is_locked(&upcall_locks[idx])); + + if (list_empty(&gmsg->gum_list)) + return; + + list_del_init(&gmsg->gum_list); + LASSERT(atomic_read(&gmsg->gum_refcount) > 1); + atomic_dec(&gmsg->gum_refcount); +} + +static +void gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + __u32 idx = gmsg->gum_mechidx; + + LASSERT(idx < MECH_MAX); + upcall_list_lock(idx); + gss_unhash_msg_nolock(gmsg); + upcall_list_unlock(idx); +} + +static +void gss_msg_fail_ctx(struct gss_upcall_msg *gmsg) +{ + if (gmsg->gum_gctx) { + struct ptlrpc_cli_ctx *ctx = &gmsg->gum_gctx->gc_base; + + LASSERT(atomic_read(&ctx->cc_refcount) > 0); + sptlrpc_cli_ctx_expire(ctx); + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + } +} + +static +struct gss_upcall_msg * gss_find_upcall(__u32 mechidx, __u32 seq) +{ + struct gss_upcall_msg *gmsg; + + upcall_list_lock(mechidx); + list_for_each_entry(gmsg, &upcall_lists[mechidx], gum_list) { + if (gmsg->gum_data.gum_seq != seq) + continue; + + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + LASSERT(gmsg->gum_mechidx == mechidx); + + atomic_inc(&gmsg->gum_refcount); + upcall_list_unlock(mechidx); + return gmsg; + } + upcall_list_unlock(mechidx); + return NULL; +} + +static +int simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/**************************************** + * rpc_pipefs apis * + ****************************************/ + +static +ssize_t gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static +ssize_t gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + struct gss_upcall_msg *gss_msg; + struct ptlrpc_cli_ctx *ctx; + struct gss_cli_ctx *gctx = NULL; + char *buf, *data; + int datalen; + int timeout, rc; + __u32 mechidx, seq, gss_err; + ENTRY; + + mechidx = (__u32) (long) rpci->private; + LASSERT(mechidx < MECH_MAX); + + OBD_ALLOC(buf, mlen); + if (!buf) + RETURN(-ENOMEM); + + if (copy_from_user(buf, src, mlen)) { + CERROR("failed copy user space data\n"); + GOTO(out_free, rc = -EFAULT); + } + data = buf; + datalen = mlen; + + /* data passed down format: + * - seq + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx (rawobj) + */ + if (simple_get_bytes(&data, &datalen, &seq, sizeof(seq))) { + CERROR("fail to get seq\n"); + GOTO(out_free, rc = -EFAULT); + } + + gss_msg = gss_find_upcall(mechidx, seq); + if (!gss_msg) { + CERROR("upcall %u has aborted earlier\n", seq); + GOTO(out_free, rc = -EINVAL); + } + + gss_unhash_msg(gss_msg); + gctx = gss_msg->gum_gctx; + LASSERT(gctx); + LASSERT(atomic_read(&gctx->gc_base.cc_refcount) > 0); + + /* timeout is not in use for now */ + if (simple_get_bytes(&data, &datalen, &timeout, sizeof(timeout))) + GOTO(out_msg, rc = -EFAULT); + + /* lgssd signal an error by gc_win == 0 */ + if (simple_get_bytes(&data, &datalen, &gctx->gc_win, + sizeof(gctx->gc_win))) + GOTO(out_msg, rc = -EFAULT); + + if (gctx->gc_win == 0) { + /* followed by: + * - rpc error + * - gss error + */ + if (simple_get_bytes(&data, &datalen, &rc, sizeof(rc))) + GOTO(out_msg, rc = -EFAULT); + if (simple_get_bytes(&data, &datalen, &gss_err,sizeof(gss_err))) + GOTO(out_msg, rc = -EFAULT); + + if (rc == 0 && gss_err == GSS_S_COMPLETE) { + CWARN("both rpc & gss error code not set\n"); + rc = -EPERM; + } + } else { + rawobj_t tmpobj; + + /* handle */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + if (rawobj_dup(&gctx->gc_handle, &tmpobj)) + GOTO(out_msg, rc = -ENOMEM); + + /* mechctx */ + if (rawobj_extract_local(&tmpobj, (__u32 **) &data, &datalen)) + GOTO(out_msg, rc = -EFAULT); + gss_err = lgss_import_sec_context(&tmpobj, + gss_msg->gum_gsec->gs_mech, + &gctx->gc_mechctx); + rc = 0; + } + + if (likely(rc == 0 && gss_err == GSS_S_COMPLETE)) { + gss_cli_ctx_uptodate(gctx); + } else { + ctx = &gctx->gc_base; + sptlrpc_cli_ctx_expire(ctx); + if (rc != -ERESTART || gss_err != GSS_S_COMPLETE) + set_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags); + + CERROR("refresh ctx %p(uid %d) failed: %d/0x%08x: %s\n", + ctx, ctx->cc_vcred.vc_uid, rc, gss_err, + test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags) ? + "fatal error" : "non-fatal"); + } + + rc = mlen; + +out_msg: + gss_release_msg(gss_msg); + +out_free: + OBD_FREE(buf, mlen); + /* FIXME + * hack pipefs: always return asked length unless all following + * downcalls might be messed up. */ + rc = mlen; + RETURN(rc); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + static cfs_time_t ratelimit = 0; + ENTRY; + + LASSERT(list_empty(&msg->list)); + + /* normally errno is >= 0 */ + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + gumd = &gmsg->gum_data; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + CERROR("failed msg %p (seq %u, uid %u, svc %u, nid "LPX64", obd %.*s): " + "errno %d\n", msg, gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd, msg->errno); + + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + cfs_time_t now = cfs_time_current_sec(); + + if (cfs_time_after(now, ratelimit)) { + CWARN("upcall timed out, is lgssd running?\n"); + ratelimit = now + 15; + } + } + gss_msg_fail_ctx(gmsg); + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + __u32 idx; + ENTRY; + + idx = (__u32) (long) rpci->private; + LASSERT(idx < MECH_MAX); + + upcall_list_lock(idx); + while (!list_empty(&upcall_lists[idx])) { + struct gss_upcall_msg *gmsg; + struct gss_upcall_msg_data *gumd; + + gmsg = list_entry(upcall_lists[idx].next, + struct gss_upcall_msg, gum_list); + gumd = &gmsg->gum_data; + LASSERT(list_empty(&gmsg->gum_base.list)); + + CERROR("failing remaining msg %p:seq %u, uid %u, svc %u, " + "nid "LPX64", obd %.*s\n", gmsg, + gumd->gum_seq, gumd->gum_uid, gumd->gum_svc, + gumd->gum_nid, (int) sizeof(gumd->gum_obd), + gumd->gum_obd); + + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + + gss_msg_fail_ctx(gmsg); + + upcall_list_unlock(idx); + gss_release_msg(gmsg); + upcall_list_lock(idx); + } + upcall_list_unlock(idx); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/**************************************** + * upcall helper functions * + ****************************************/ + +static +int gss_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_upcall_msg *gmsg; + int rc = 0; + ENTRY; + + might_sleep(); + + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_import); + LASSERT(ctx->cc_sec->ps_import->imp_obd); + + imp = ctx->cc_sec->ps_import; + if (!imp->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + + OBD_ALLOC_PTR(gmsg); + if (!gmsg) + RETURN(-ENOMEM); + + /* initialize pipefs base msg */ + INIT_LIST_HEAD(&gmsg->gum_base.list); + gmsg->gum_base.data = &gmsg->gum_data; + gmsg->gum_base.len = sizeof(gmsg->gum_data); + gmsg->gum_base.copied = 0; + gmsg->gum_base.errno = 0; + + /* init upcall msg */ + atomic_set(&gmsg->gum_refcount, 1); + gmsg->gum_mechidx = mech_name2idx(gsec->gs_mech->gm_name); + gmsg->gum_gsec = gsec; + gmsg->gum_gctx = container_of(sptlrpc_cli_ctx_get(ctx), + struct gss_cli_ctx, gc_base); + gmsg->gum_data.gum_seq = upcall_get_sequence(); + gmsg->gum_data.gum_uid = ctx->cc_vcred.vc_uid; + gmsg->gum_data.gum_gid = 0; /* not used for now */ + gmsg->gum_data.gum_svc = import_to_gss_svc(imp); + gmsg->gum_data.gum_nid = imp->imp_connection->c_peer.nid; + strncpy(gmsg->gum_data.gum_obd, imp->imp_obd->obd_name, + sizeof(gmsg->gum_data.gum_obd)); + + /* This only could happen when sysadmin set it dead/expired + * using lctl by force. */ + if (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK) { + CWARN("ctx %p(%u->%s) was set flags %lx unexpectedly\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_flags); + + LASSERT(!(ctx->cc_flags & PTLRPC_CTX_UPTODATE)); + ctx->cc_flags |= PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR; + + rc = -EIO; + goto err_free; + } + + upcall_msg_enlist(gmsg); + + rc = rpc_queue_upcall(de_pipes[gmsg->gum_mechidx]->d_inode, + &gmsg->gum_base); + if (rc) { + CERROR("rpc_queue_upcall failed: %d\n", rc); + + upcall_msg_delist(gmsg); + goto err_free; + } + + RETURN(0); +err_free: + OBD_FREE_PTR(gmsg); + RETURN(rc); +} + +static +int gss_cli_ctx_refresh_pf(struct ptlrpc_cli_ctx *ctx) +{ + /* if we are refreshing for root, also update the reverse + * handle index, do not confuse reverse contexts. */ + if (ctx->cc_vcred.vc_uid == 0) { + struct gss_sec *gsec; + + gsec = container_of(ctx->cc_sec, struct gss_sec, gs_base); + gsec->gs_rvs_hdl = gss_get_next_ctx_index(); + } + + return gss_ctx_refresh_pf(ctx); +} + +/**************************************** + * lustre gss pipefs policy * + ****************************************/ + +static struct ptlrpc_ctx_ops gss_pipefs_ctxops = { + .match = gss_cli_ctx_match, + .refresh = gss_cli_ctx_refresh_pf, + .validate = gss_cli_ctx_validate_pf, + .die = gss_cli_ctx_die_pf, + .sign = gss_cli_ctx_sign, + .verify = gss_cli_ctx_verify, + .seal = gss_cli_ctx_seal, + .unseal = gss_cli_ctx_unseal, + .wrap_bulk = gss_cli_ctx_wrap_bulk, + .unwrap_bulk = gss_cli_ctx_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops gss_sec_pipefs_cops = { + .create_sec = gss_sec_create_pf, + .destroy_sec = gss_sec_destroy_pf, + .kill_sec = gss_sec_kill, + .lookup_ctx = gss_sec_lookup_ctx_pf, + .release_ctx = gss_sec_release_ctx_pf, + .flush_ctx_cache = gss_sec_flush_ctx_cache_pf, + .install_rctx = gss_sec_install_rctx, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, + .alloc_repbuf = gss_alloc_repbuf, + .free_repbuf = gss_free_repbuf, + .enlarge_reqbuf = gss_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops gss_sec_pipefs_sops = { + .accept = gss_svc_accept_pf, + .invalidate_ctx = gss_svc_invalidate_ctx, + .alloc_rs = gss_svc_alloc_rs, + .authorize = gss_svc_authorize, + .free_rs = gss_svc_free_rs, + .free_ctx = gss_svc_free_ctx, + .unwrap_bulk = gss_svc_unwrap_bulk, + .wrap_bulk = gss_svc_wrap_bulk, + .install_rctx = gss_svc_install_rctx_pf, +}; + +static struct ptlrpc_sec_policy gss_policy_pipefs = { + .sp_owner = THIS_MODULE, + .sp_name = "gss.pipefs", + .sp_policy = SPTLRPC_POLICY_GSS_PIPEFS, + .sp_cops = &gss_sec_pipefs_cops, + .sp_sops = &gss_sec_pipefs_sops, +}; + +static +int __init gss_init_pipefs_upcall(void) +{ + struct dentry *de; + + /* pipe dir */ + de = rpc_mkdir(LUSTRE_PIPE_ROOT, NULL); + if (IS_ERR(de) && PTR_ERR(de) != -EEXIST) { + CERROR("Failed to create gss pipe dir: %ld\n", PTR_ERR(de)); + return PTR_ERR(de); + } + + /* FIXME hack pipefs: dput will sometimes cause oops during module + * unload and lgssd close the pipe fds. */ + + /* krb5 mechanism */ + de = rpc_mkpipe(LUSTRE_PIPE_KRB5, (void *) MECH_KRB5, &gss_upcall_ops, + RPC_PIPE_WAIT_FOR_OPEN); + if (!de || IS_ERR(de)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + LUSTRE_PIPE_KRB5, PTR_ERR(de)); + rpc_rmdir(LUSTRE_PIPE_ROOT); + return PTR_ERR(de); + } + + de_pipes[MECH_KRB5] = de; + INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); + spin_lock_init(&upcall_locks[MECH_KRB5]); + + return 0; +} + +static +void __exit gss_exit_pipefs_upcall(void) +{ + __u32 i; + + for (i = 0; i < MECH_MAX; i++) { + LASSERT(list_empty(&upcall_lists[i])); + + /* dput pipe dentry here might cause lgssd oops. */ + de_pipes[i] = NULL; + } + + rpc_unlink(LUSTRE_PIPE_KRB5); + rpc_rmdir(LUSTRE_PIPE_ROOT); +} + +int __init gss_init_pipefs(void) +{ + int rc; + + rc = gss_init_pipefs_upcall(); + if (rc) + return rc; + + rc = sptlrpc_register_policy(&gss_policy_pipefs); + if (rc) { + gss_exit_pipefs_upcall(); + return rc; + } + + return 0; +} + +void __exit gss_exit_pipefs(void) +{ + gss_exit_pipefs_upcall(); + sptlrpc_unregister_policy(&gss_policy_pipefs); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c new file mode 100644 index 000000000000..474ecf805307 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_rawobj.c @@ -0,0 +1,242 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/gss/gss_rawobj.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_empty(rawobj_t *obj) +{ + LASSERT(equi(obj->len, obj->data)); + return (obj->len == 0); +} + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC_LARGE(obj->data, len); + if (!obj->data) { + obj->len = 0; + RETURN(-ENOMEM); + } + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE_LARGE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + (!a->len || !memcmp(a->data, b->data, a->len))); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC_LARGE(dest->data, dest->len); + if (!dest->data) { + dest->len = 0; + return -ENOMEM; + } + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = cfs_size_round4(obj->len); + + if (*buflen < 4 + len) { + CERROR("buflen %u < %u\n", *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("buflen %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? obj->len : cfs_size_round4(obj->len); + if (*buflen < len) { + CERROR("buflen %u < %u\n", *buflen, len); + obj->len = 0; + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC_LARGE(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + obj->len = 0; + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} + +int rawobj_extract_local_alloc(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 1, 1); +} + +int rawobj_from_netobj(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = netobj->len; + rawobj->data = netobj->data; + return 0; +} + +int rawobj_from_netobj_alloc(rawobj_t *rawobj, netobj_t *netobj) +{ + rawobj->len = 0; + rawobj->data = NULL; + + if (netobj->len == 0) + return 0; + + OBD_ALLOC_LARGE(rawobj->data, netobj->len); + if (rawobj->data == NULL) + return -ENOMEM; + + rawobj->len = netobj->len; + memcpy(rawobj->data, netobj->data, netobj->len); + return 0; +} + +/**************************************** + * misc more * + ****************************************/ + +int buffer_extract_bytes(const void **buf, __u32 *buflen, + void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c new file mode 100644 index 000000000000..31b50ea19c25 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -0,0 +1,1099 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#define GSS_SVC_UPCALL_TIMEOUT (20) + +static spinlock_t __ctx_index_lock; +static __u64 __ctx_index; + +__u64 gss_get_next_ctx_index(void) +{ + __u64 idx; + + spin_lock(&__ctx_index_lock); + idx = __ctx_index++; + spin_unlock(&__ctx_index_lock); + + return idx; +} + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + + do { + if (len == length) { + c = (char) len; + len = -1; + } else + c = *buf++; + + l = (l << 8) | c; + len++; + + if ((len & (BITS_PER_LONG/8-1)) == 0) + hash = cfs_hash_long(hash^l, BITS_PER_LONG); + } while (len); + + return hash >> (BITS_PER_LONG - bits); +} + +/**************************************** + * rsi cache * + ****************************************/ + +#define RSI_HASHBITS (6) +#define RSI_HASHMAX (1 << RSI_HASHBITS) +#define RSI_HASHMASK (RSI_HASHMAX - 1) + +struct rsi { + struct cache_head h; + __u32 lustre_svc; + __u64 nid; + wait_queue_head_t waitq; + rawobj_t in_handle, in_token; + rawobj_t out_handle, out_token; + int major_status, minor_status; +}; + +static struct cache_head *rsi_table[RSI_HASHMAX]; +static struct cache_detail rsi_cache; +static struct rsi *rsi_update(struct rsi *new, struct rsi *old); +static struct rsi *rsi_lookup(struct rsi *item); + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem((char *)item->in_handle.data, item->in_handle.len, + RSI_HASHBITS) ^ + hash_mem((char *)item->in_token.data, item->in_token.len, + RSI_HASHBITS); +} + +static inline int __rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_free(struct rsi *rsi) +{ + rawobj_free(&rsi->in_handle); + rawobj_free(&rsi->in_token); + rawobj_free(&rsi->out_handle); + rawobj_free(&rsi->out_token); +} + +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsi = container_of(h, struct rsi, h); + __u64 index = 0; + + /* if in_handle is null, provide kernel suggestion */ + if (rsi->in_handle.len == 0) + index = gss_get_next_ctx_index(); + + qword_addhex(bpp, blen, (char *) &rsi->lustre_svc, + sizeof(rsi->lustre_svc)); + qword_addhex(bpp, blen, (char *) &rsi->nid, sizeof(rsi->nid)); + qword_addhex(bpp, blen, (char *) &index, sizeof(index)); + qword_addhex(bpp, blen, rsi->in_handle.data, rsi->in_handle.len); + qword_addhex(bpp, blen, rsi->in_token.data, rsi->in_token.len); + (*bpp)[-1] = '\n'; +} + +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, rsi_request); +} + +static inline void __rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle = RAWOBJ_EMPTY; + new->out_token = RAWOBJ_EMPTY; + + new->in_handle = item->in_handle; + item->in_handle = RAWOBJ_EMPTY; + new->in_token = item->in_token; + item->in_token = RAWOBJ_EMPTY; + + new->lustre_svc = item->lustre_svc; + new->nid = item->nid; + init_waitqueue_head(&new->waitq); +} + +static inline void __rsi_update(struct rsi *new, struct rsi *item) +{ + LASSERT(new->out_handle.len == 0); + LASSERT(new->out_token.len == 0); + + new->out_handle = item->out_handle; + item->out_handle = RAWOBJ_EMPTY; + new->out_token = item->out_token; + item->out_token = RAWOBJ_EMPTY; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static void rsi_put(struct kref *ref) +{ + struct rsi *rsi = container_of(ref, struct rsi, h.ref); + + LASSERT(rsi->h.next == NULL); + rsi_free(rsi); + OBD_FREE_PTR(rsi); +} + +static int rsi_match(struct cache_head *a, struct cache_head *b) +{ + struct rsi *item = container_of(a, struct rsi, h); + struct rsi *tmp = container_of(b, struct rsi, h); + + return __rsi_match(item, tmp); +} + +static void rsi_init(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_init(new, item); +} + +static void update_rsi(struct cache_head *cnew, struct cache_head *citem) +{ + struct rsi *new = container_of(cnew, struct rsi, h); + struct rsi *item = container_of(citem, struct rsi, h); + + __rsi_update(new, item); +} + +static struct cache_head *rsi_alloc(void) +{ + struct rsi *rsi; + + OBD_ALLOC_PTR(rsi); + if (rsi) + return &rsi->h; + else + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + char *ep; + int len; + struct rsi rsii, *rsip = NULL; + time_t expiry; + int status = -EINVAL; + ENTRY; + + + memset(&rsii, 0, sizeof(rsii)); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.in_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsip = rsi_lookup(&rsii); + if (!rsip) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + if (expiry == 0) + goto out; + + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + + /* major */ + rsii.major_status = simple_strtol(buf, &ep, 10); + if (*ep) + goto out; + + /* minor */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtol(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_handle, buf, len)) { + status = -ENOMEM; + goto out; + } + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (rawobj_alloc(&rsii.out_token, buf, len)) { + status = -ENOMEM; + goto out; + } + + rsii.h.expiry_time = expiry; + rsip = rsi_update(&rsii, rsip); + status = 0; +out: + rsi_free(&rsii); + if (rsip) { + wake_up_all(&rsip->waitq); + cache_put(&rsip->h, &rsi_cache); + } else { + status = -ENOMEM; + } + + if (status) + CERROR("rsi parse error %d\n", status); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.sptlrpc.init", + .cache_put = rsi_put, + .cache_upcall = rsi_upcall, + .cache_parse = rsi_parse, + .match = rsi_match, + .init = rsi_init, + .update = update_rsi, + .alloc = rsi_alloc, +}; + +static struct rsi *rsi_lookup(struct rsi *item) +{ + struct cache_head *ch; + int hash = rsi_hash(item); + + ch = sunrpc_cache_lookup(&rsi_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +static struct rsi *rsi_update(struct rsi *new, struct rsi *old) +{ + struct cache_head *ch; + int hash = rsi_hash(new); + + ch = sunrpc_cache_update(&rsi_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsi, h); + else + return NULL; +} + +/**************************************** + * rsc cache * + ****************************************/ + +#define RSC_HASHBITS (10) +#define RSC_HASHMAX (1 << RSC_HASHBITS) +#define RSC_HASHMASK (RSC_HASHMAX - 1) + +struct rsc { + struct cache_head h; + struct obd_device *target; + rawobj_t handle; + struct gss_svc_ctx ctx; +}; + +static struct cache_head *rsc_table[RSC_HASHMAX]; +static struct cache_detail rsc_cache; +static struct rsc *rsc_update(struct rsc *new, struct rsc *old); +static struct rsc *rsc_lookup(struct rsc *item); + +static void rsc_free(struct rsc *rsci) +{ + rawobj_free(&rsci->handle); + rawobj_free(&rsci->ctx.gsc_rvs_hdl); + lgss_delete_sec_context(&rsci->ctx.gsc_mechctx); +} + +static inline int rsc_hash(struct rsc *rsci) +{ + return hash_mem((char *)rsci->handle.data, + rsci->handle.len, RSC_HASHBITS); +} + +static inline int __rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static inline void __rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle = tmp->handle; + tmp->handle = RAWOBJ_EMPTY; + + new->target = NULL; + memset(&new->ctx, 0, sizeof(new->ctx)); + new->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; +} + +static inline void __rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->ctx = tmp->ctx; + tmp->ctx.gsc_rvs_hdl = RAWOBJ_EMPTY; + tmp->ctx.gsc_mechctx = NULL; + + memset(&new->ctx.gsc_seqdata, 0, sizeof(new->ctx.gsc_seqdata)); + spin_lock_init(&new->ctx.gsc_seqdata.ssd_lock); +} + +static void rsc_put(struct kref *ref) +{ + struct rsc *rsci = container_of(ref, struct rsc, h.ref); + + LASSERT(rsci->h.next == NULL); + rsc_free(rsci); + OBD_FREE_PTR(rsci); +} + +static int rsc_match(struct cache_head *a, struct cache_head *b) +{ + struct rsc *new = container_of(a, struct rsc, h); + struct rsc *tmp = container_of(b, struct rsc, h); + + return __rsc_match(new, tmp); +} + +static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_init(new, tmp); +} + +static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) +{ + struct rsc *new = container_of(cnew, struct rsc, h); + struct rsc *tmp = container_of(ctmp, struct rsc, h); + + __rsc_update(new, tmp); +} + +static struct cache_head * rsc_alloc(void) +{ + struct rsc *rsc; + + OBD_ALLOC_PTR(rsc); + if (rsc) + return &rsc->h; + else + return NULL; +} + +static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) +{ + char *buf = mesg; + int len, rv, tmp_int; + struct rsc rsci, *rscp = NULL; + time_t expiry; + int status = -EINVAL; + struct gss_api_mech *gm = NULL; + + memset(&rsci, 0, sizeof(rsci)); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + rsci.ctx.gsc_remote = (tmp_int != 0); + + /* root user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_root = (tmp_int != 0); + + /* mds user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get mds user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_mds = (tmp_int != 0); + + /* oss user flag */ + rv = get_int(&mesg, &tmp_int); + if (rv) { + CERROR("fail to get oss user flag\n"); + goto out; + } + rsci.ctx.gsc_usr_oss = (tmp_int != 0); + + /* mapped uid */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_mapped_uid); + if (rv) { + CERROR("fail to get mapped uid\n"); + goto out; + } + + rscp = rsc_lookup(&rsci); + if (!rscp) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, (int *) &rsci.ctx.gsc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) { + CERROR("NOENT? set rsc entry negative\n"); + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + } else { + rawobj_t tmp_buf; + unsigned long ctx_expiry; + + /* gid */ + if (get_int(&mesg, (int *) &rsci.ctx.gsc_gid)) + goto out; + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = lgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + + tmp_buf.len = len; + tmp_buf.data = (unsigned char *)buf; + if (lgss_import_sec_context(&tmp_buf, gm, + &rsci.ctx.gsc_mechctx)) + goto out; + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. */ + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + goto out; + } + expiry = (time_t) ctx_expiry; + } + + rsci.h.expiry_time = expiry; + rscp = rsc_update(&rsci, rscp); + status = 0; +out: + if (gm) + lgss_mech_put(gm); + rsc_free(&rsci); + if (rscp) + cache_put(&rscp->h, &rsc_cache); + else + status = -ENOMEM; + + if (status) + CERROR("parse rsc error %d\n", status); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.sptlrpc.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, + .match = rsc_match, + .init = rsc_init, + .update = update_rsc, + .alloc = rsc_alloc, +}; + +static struct rsc *rsc_lookup(struct rsc *item) +{ + struct cache_head *ch; + int hash = rsc_hash(item); + + ch = sunrpc_cache_lookup(&rsc_cache, &item->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +static struct rsc *rsc_update(struct rsc *new, struct rsc *old) +{ + struct cache_head *ch; + int hash = rsc_hash(new); + + ch = sunrpc_cache_update(&rsc_cache, &new->h, &old->h, hash); + if (ch) + return container_of(ch, struct rsc, h); + else + return NULL; +} + +#define COMPAT_RSC_PUT(item, cd) cache_put((item), (cd)) + +/**************************************** + * rsc cache flush * + ****************************************/ + +typedef int rsc_entry_match(struct rsc *rscp, long data); + +static void rsc_flush(rsc_entry_match *match, long data) +{ + struct cache_head **ch; + struct rsc *rscp; + int n; + ENTRY; + + write_lock(&rsc_cache.hash_lock); + for (n = 0; n < RSC_HASHMAX; n++) { + for (ch = &rsc_cache.hash_table[n]; *ch;) { + rscp = container_of(*ch, struct rsc, h); + + if (!match(rscp, data)) { + ch = &((*ch)->next); + continue; + } + + /* it seems simply set NEGATIVE doesn't work */ + *ch = (*ch)->next; + rscp->h.next = NULL; + cache_get(&rscp->h); + set_bit(CACHE_NEGATIVE, &rscp->h.flags); + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + rsc_cache.entries--; + } + } + write_unlock(&rsc_cache.hash_lock); + EXIT; +} + +static int match_uid(struct rsc *rscp, long uid) +{ + if ((int) uid == -1) + return 1; + return ((int) rscp->ctx.gsc_uid == (int) uid); +} + +static int match_target(struct rsc *rscp, long target) +{ + return (rscp->target == (struct obd_device *) target); +} + +static inline void rsc_flush_uid(int uid) +{ + if (uid == -1) + CWARN("flush all gss contexts...\n"); + + rsc_flush(match_uid, (long) uid); +} + +static inline void rsc_flush_target(struct obd_device *target) +{ + rsc_flush(match_target, (long) target); +} + +void gss_secsvc_flush(struct obd_device *target) +{ + rsc_flush_target(target); +} +EXPORT_SYMBOL(gss_secsvc_flush); + +static struct rsc *gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (rawobj_dup(&rsci.handle, handle)) + return NULL; + + found = rsc_lookup(&rsci); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +int gss_svc_upcall_install_rvs_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + struct rsc rsci, *rscp = NULL; + unsigned long ctx_expiry; + __u32 major; + int rc; + ENTRY; + + memset(&rsci, 0, sizeof(rsci)); + + if (rawobj_alloc(&rsci.handle, (char *) &gsec->gs_rvs_hdl, + sizeof(gsec->gs_rvs_hdl))) + GOTO(out, rc = -ENOMEM); + + rscp = rsc_lookup(&rsci); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + major = lgss_copy_reverse_context(gctx->gc_mechctx, + &rsci.ctx.gsc_mechctx); + if (major != GSS_S_COMPLETE) + GOTO(out, rc = -ENOMEM); + + if (lgss_inquire_context(rsci.ctx.gsc_mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + GOTO(out, rc = -EINVAL); + } + rsci.h.expiry_time = (time_t) ctx_expiry; + + if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0) + rsci.ctx.gsc_usr_mds = 1; + else if (strcmp(imp->imp_obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) + rsci.ctx.gsc_usr_oss = 1; + else + rsci.ctx.gsc_usr_root = 1; + + rscp = rsc_update(&rsci, rscp); + if (rscp == NULL) + GOTO(out, rc = -ENOMEM); + + rscp->target = imp->imp_obd; + rawobj_dup(&gctx->gc_svc_handle, &rscp->handle); + + CWARN("create reverse svc ctx %p to %s: idx "LPX64"\n", + &rscp->ctx, obd2cli_tgt(imp->imp_obd), gsec->gs_rvs_hdl); + rc = 0; +out: + if (rscp) + cache_put(&rscp->h, &rsc_cache); + rsc_free(&rsci); + + if (rc) + CERROR("create reverse svc ctx: idx "LPX64", rc %d\n", + gsec->gs_rvs_hdl, rc); + RETURN(rc); +} + +int gss_svc_upcall_expire_rvs_ctx(rawobj_t *handle) +{ + const cfs_time_t expire = 20; + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) expire soon\n", + &rscp->ctx, rscp); + + rscp->h.expiry_time = cfs_time_current_sec() + expire; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +int gss_svc_upcall_dup_handle(rawobj_t *handle, struct gss_svc_ctx *ctx) +{ + struct rsc *rscp = container_of(ctx, struct rsc, ctx); + + return rawobj_dup(handle, &rscp->handle); +} + +int gss_svc_upcall_update_sequence(rawobj_t *handle, __u32 seq) +{ + struct rsc *rscp; + + rscp = gss_svc_searchbyctx(handle); + if (rscp) { + CDEBUG(D_SEC, "reverse svcctx %p (rsc %p) update seq to %u\n", + &rscp->ctx, rscp, seq + 1); + + rscp->ctx.gsc_rvs_seq = seq + 1; + COMPAT_RSC_PUT(&rscp->h, &rsc_cache); + } + return 0; +} + +static struct cache_deferred_req* cache_upcall_defer(struct cache_req *req) +{ + return NULL; +} +static struct cache_req cache_upcall_chandle = { cache_upcall_defer }; + +int gss_svc_upcall_handle_init(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + struct obd_device *target, + __u32 lustre_svc, + rawobj_t *rvs_hdl, + rawobj_t *in_token) +{ + struct ptlrpc_reply_state *rs; + struct rsc *rsci = NULL; + struct rsi *rsip = NULL, rsikey; + wait_queue_t wait; + int replen = sizeof(struct ptlrpc_body); + struct gss_rep_header *rephdr; + int first_check = 1; + int rc = SECSVC_DROP; + ENTRY; + + memset(&rsikey, 0, sizeof(rsikey)); + rsikey.lustre_svc = lustre_svc; + rsikey.nid = (__u64) req->rq_peer.nid; + + /* duplicate context handle. for INIT it always 0 */ + if (rawobj_dup(&rsikey.in_handle, &gw->gw_handle)) { + CERROR("fail to dup context handle\n"); + GOTO(out, rc); + } + + if (rawobj_dup(&rsikey.in_token, in_token)) { + CERROR("can't duplicate token\n"); + rawobj_free(&rsikey.in_handle); + GOTO(out, rc); + } + + rsip = rsi_lookup(&rsikey); + rsi_free(&rsikey); + if (!rsip) { + CERROR("error in rsi_lookup.\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } + + cache_get(&rsip->h); /* take an extra ref */ + init_waitqueue_head(&rsip->waitq); + init_waitqueue_entry_current(&wait); + add_wait_queue(&rsip->waitq, &wait); + +cache_check: + /* Note each time cache_check() will drop a reference if return + * non-zero. We hold an extra reference on initial rsip, but must + * take care of following calls. */ + rc = cache_check(&rsi_cache, &rsip->h, &cache_upcall_chandle); + switch (rc) { + case -EAGAIN: { + int valid; + + if (first_check) { + first_check = 0; + + read_lock(&rsi_cache.hash_lock); + valid = test_bit(CACHE_VALID, &rsip->h.flags); + if (valid == 0) + set_current_state(TASK_INTERRUPTIBLE); + read_unlock(&rsi_cache.hash_lock); + + if (valid == 0) + schedule_timeout(GSS_SVC_UPCALL_TIMEOUT * + HZ); + + cache_get(&rsip->h); + goto cache_check; + } + CWARN("waited %ds timeout, drop\n", GSS_SVC_UPCALL_TIMEOUT); + break; + } + case -ENOENT: + CWARN("cache_check return ENOENT, drop\n"); + break; + case 0: + /* if not the first check, we have to release the extra + * reference we just added on it. */ + if (!first_check) + cache_put(&rsip->h, &rsi_cache); + CDEBUG(D_SEC, "cache_check is good\n"); + break; + } + + remove_wait_queue(&rsip->waitq, &wait); + cache_put(&rsip->h, &rsi_cache); + + if (rc) + GOTO(out, rc = SECSVC_DROP); + + rc = SECSVC_DROP; + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("authentication failed\n"); + + if (!gss_pack_err_notify(req, GSS_S_FAILURE, 0)) + rc = SECSVC_COMPLETE; + + GOTO(out, rc); + } else { + cache_get(&rsci->h); + grctx->src_ctx = &rsci->ctx; + } + + if (rawobj_dup(&rsci->ctx.gsc_rvs_hdl, rvs_hdl)) { + CERROR("failed duplicate reverse handle\n"); + GOTO(out, rc); + } + + rsci->target = target; + + CDEBUG(D_SEC, "server create rsc %p(%u->%s)\n", + rsci, rsci->ctx.gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + if (rsip->out_handle.len > PTLRPC_GSS_MAX_HANDLE_SIZE) { + CERROR("handle size %u too large\n", rsip->out_handle.len); + GOTO(out, rc = SECSVC_DROP); + } + + grctx->src_init = 1; + grctx->src_reserve_len = cfs_size_round4(rsip->out_token.len); + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("failed to pack reply: %d\n", rc); + GOTO(out, rc = SECSVC_DROP); + } + + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_bufcount == 3); + LASSERT(rs->rs_repbuf->lm_buflens[0] >= + sizeof(*rephdr) + rsip->out_handle.len); + LASSERT(rs->rs_repbuf->lm_buflens[2] >= rsip->out_token.len); + + rephdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + rephdr->gh_version = PTLRPC_GSS_VERSION; + rephdr->gh_flags = 0; + rephdr->gh_proc = PTLRPC_GSS_PROC_ERR; + rephdr->gh_major = rsip->major_status; + rephdr->gh_minor = rsip->minor_status; + rephdr->gh_seqwin = GSS_SEQ_WIN; + rephdr->gh_handle.len = rsip->out_handle.len; + memcpy(rephdr->gh_handle.data, rsip->out_handle.data, + rsip->out_handle.len); + + memcpy(lustre_msg_buf(rs->rs_repbuf, 2, 0), rsip->out_token.data, + rsip->out_token.len); + + rs->rs_repdata_len = lustre_shrink_msg(rs->rs_repbuf, 2, + rsip->out_token.len, 0); + + rc = SECSVC_OK; + +out: + /* it looks like here we should put rsip also, but this mess up + * with NFS cache mgmt code... FIXME */ +#if 0 + if (rsip) + rsi_put(&rsip->h, &rsi_cache); +#endif + + if (rsci) { + /* if anything went wrong, we don't keep the context too */ + if (rc != SECSVC_OK) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else + CDEBUG(D_SEC, "create rsc with idx "LPX64"\n", + gss_handle_to_u64(&rsci->handle)); + + COMPAT_RSC_PUT(&rsci->h, &rsc_cache); + } + RETURN(rc); +} + +struct gss_svc_ctx *gss_svc_upcall_get_ctx(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct rsc *rsc; + + rsc = gss_svc_searchbyctx(&gw->gw_handle); + if (!rsc) { + CWARN("Invalid gss ctx idx "LPX64" from %s\n", + gss_handle_to_u64(&gw->gw_handle), + libcfs_nid2str(req->rq_peer.nid)); + return NULL; + } + + return &rsc->ctx; +} + +void gss_svc_upcall_put_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + COMPAT_RSC_PUT(&rsc->h, &rsc_cache); +} + +void gss_svc_upcall_destroy_ctx(struct gss_svc_ctx *ctx) +{ + struct rsc *rsc = container_of(ctx, struct rsc, ctx); + + /* can't be found */ + set_bit(CACHE_NEGATIVE, &rsc->h.flags); + /* to be removed at next scan */ + rsc->h.expiry_time = 1; +} + +int __init gss_init_svc_upcall(void) +{ + int i; + + spin_lock_init(&__ctx_index_lock); + /* + * this helps reducing context index confliction. after server reboot, + * conflicting request from clients might be filtered out by initial + * sequence number checking, thus no chance to sent error notification + * back to clients. + */ + cfs_get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + + + cache_register(&rsi_cache); + cache_register(&rsc_cache); + + /* FIXME this looks stupid. we intend to give lsvcgssd a chance to open + * the init upcall channel, otherwise there's big chance that the first + * upcall issued before the channel be opened thus nfsv4 cache code will + * drop the request direclty, thus lead to unnecessary recovery time. + * here we wait at miximum 1.5 seconds. */ + for (i = 0; i < 6; i++) { + if (atomic_read(&rsi_cache.readers) > 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + LASSERT(HZ >= 4); + schedule_timeout(HZ / 4); + } + + if (atomic_read(&rsi_cache.readers) == 0) + CWARN("Init channel is not opened by lsvcgssd, following " + "request might be dropped until lsvcgssd is active\n"); + + return 0; +} + +void __exit gss_exit_svc_upcall(void) +{ + cache_purge(&rsi_cache); + cache_unregister(&rsi_cache); + + cache_purge(&rsc_cache); + cache_unregister(&rsc_cache); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c new file mode 100644 index 000000000000..2522e0517282 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/lproc_gss.c @@ -0,0 +1,219 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static struct proc_dir_entry *gss_proc_root = NULL; +static struct proc_dir_entry *gss_proc_lk = NULL; + +/* + * statistic of "out-of-sequence-window" + */ +static struct { + spinlock_t oos_lock; + atomic_t oos_cli_count; /* client occurrence */ + int oos_cli_behind; /* client max seqs behind */ + atomic_t oos_svc_replay[3]; /* server replay detected */ + atomic_t oos_svc_pass[3]; /* server verified ok */ +} gss_stat_oos = { + .oos_cli_count = ATOMIC_INIT(0), + .oos_cli_behind = 0, + .oos_svc_replay = { ATOMIC_INIT(0), }, + .oos_svc_pass = { ATOMIC_INIT(0), }, +}; + +void gss_stat_oos_record_cli(int behind) +{ + atomic_inc(&gss_stat_oos.oos_cli_count); + + spin_lock(&gss_stat_oos.oos_lock); + if (behind > gss_stat_oos.oos_cli_behind) + gss_stat_oos.oos_cli_behind = behind; + spin_unlock(&gss_stat_oos.oos_lock); +} + +void gss_stat_oos_record_svc(int phase, int replay) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (replay) + atomic_inc(&gss_stat_oos.oos_svc_replay[phase]); + else + atomic_inc(&gss_stat_oos.oos_svc_pass[phase]); +} + +static int gss_proc_read_oos(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int written; + + written = snprintf(page, count, + "seqwin: %u\n" + "backwin: %u\n" + "client fall behind seqwin\n" + " occurrence: %d\n" + " max seq behind: %d\n" + "server replay detected:\n" + " phase 0: %d\n" + " phase 1: %d\n" + " phase 2: %d\n" + "server verify ok:\n" + " phase 2: %d\n", + GSS_SEQ_WIN_MAIN, + GSS_SEQ_WIN_BACK, + atomic_read(&gss_stat_oos.oos_cli_count), + gss_stat_oos.oos_cli_behind, + atomic_read(&gss_stat_oos.oos_svc_replay[0]), + atomic_read(&gss_stat_oos.oos_svc_replay[1]), + atomic_read(&gss_stat_oos.oos_svc_replay[2]), + atomic_read(&gss_stat_oos.oos_svc_pass[2])); + + return written; +} + +static int gss_proc_write_secinit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int rc; + + rc = gss_do_ctx_init_rpc((char *) buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + + return ((int) count); +} + +static struct lprocfs_vars gss_lprocfs_vars[] = { + { "replays", gss_proc_read_oos, NULL }, + { "init_channel", NULL, gss_proc_write_secinit, NULL, NULL, 0222 }, + { NULL } +}; + +/* + * for userspace helper lgss_keyring. + * + * debug_level: [0, 4], defined in utils/gss/lgss_utils.h + */ +static int gss_lk_debug_level = 1; + +static int gss_lk_proc_read_dl(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return snprintf(page, count, "%u\n", gss_lk_debug_level); +} + +static int gss_lk_proc_write_dl(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0 || val > 4) + return -ERANGE; + + gss_lk_debug_level = val; + return count; +} + +static struct lprocfs_vars gss_lk_lprocfs_vars[] = { + { "debug_level", gss_lk_proc_read_dl, gss_lk_proc_write_dl, NULL }, + { NULL } +}; + +void gss_exit_lproc(void) +{ + if (gss_proc_lk) { + lprocfs_remove(&gss_proc_lk); + gss_proc_lk = NULL; + } + + if (gss_proc_root) { + lprocfs_remove(&gss_proc_root); + gss_proc_root = NULL; + } +} + +int gss_init_lproc(void) +{ + int rc; + + spin_lock_init(&gss_stat_oos.oos_lock); + + gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root, + gss_lprocfs_vars, NULL); + if (IS_ERR(gss_proc_root)) { + gss_proc_root = NULL; + GOTO(err_out, rc = PTR_ERR(gss_proc_root)); + } + + gss_proc_lk = lprocfs_register("lgss_keyring", gss_proc_root, + gss_lk_lprocfs_vars, NULL); + if (IS_ERR(gss_proc_lk)) { + gss_proc_lk = NULL; + GOTO(err_out, rc = PTR_ERR(gss_proc_root)); + } + + return 0; + +err_out: + CERROR("failed to initialize gss lproc entries: %d\n", rc); + gss_exit_lproc(); + return rc; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c new file mode 100644 index 000000000000..ebca858ca183 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/gss/sec_gss.c @@ -0,0 +1,2916 @@ +/* + * Modifications for Lustre + * + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#include +#include + +/* + * early reply have fixed size, respectively in privacy and integrity mode. + * so we calculate them only once. + */ +static int gss_at_reply_off_integ; +static int gss_at_reply_off_priv; + + +static inline int msg_last_segidx(struct lustre_msg *msg) +{ + LASSERT(msg->lm_bufcount > 0); + return msg->lm_bufcount - 1; +} +static inline int msg_last_seglen(struct lustre_msg *msg) +{ + return msg->lm_buflens[msg_last_segidx(msg)]; +} + +/******************************************** + * wire data swabber * + ********************************************/ + +static +void gss_header_swabber(struct gss_header *ghdr) +{ + __swab32s(&ghdr->gh_flags); + __swab32s(&ghdr->gh_proc); + __swab32s(&ghdr->gh_seq); + __swab32s(&ghdr->gh_svc); + __swab32s(&ghdr->gh_pad1); + __swab32s(&ghdr->gh_handle.len); +} + +struct gss_header *gss_swab_header(struct lustre_msg *msg, int segment, + int swabbed) +{ + struct gss_header *ghdr; + + ghdr = lustre_msg_buf(msg, segment, sizeof(*ghdr)); + if (ghdr == NULL) + return NULL; + + if (swabbed) + gss_header_swabber(ghdr); + + if (sizeof(*ghdr) + ghdr->gh_handle.len > msg->lm_buflens[segment]) { + CERROR("gss header has length %d, now %u received\n", + (int) sizeof(*ghdr) + ghdr->gh_handle.len, + msg->lm_buflens[segment]); + return NULL; + } + + return ghdr; +} + +#if 0 +static +void gss_netobj_swabber(netobj_t *obj) +{ + __swab32s(&obj->len); +} + +netobj_t *gss_swab_netobj(struct lustre_msg *msg, int segment) +{ + netobj_t *obj; + + obj = lustre_swab_buf(msg, segment, sizeof(*obj), gss_netobj_swabber); + if (obj && sizeof(*obj) + obj->len > msg->lm_buflens[segment]) { + CERROR("netobj require length %u but only %u received\n", + (unsigned int) sizeof(*obj) + obj->len, + msg->lm_buflens[segment]); + return NULL; + } + + return obj; +} +#endif + +/* + * payload should be obtained from mechanism. but currently since we + * only support kerberos, we could simply use fixed value. + * krb5 "meta" data: + * - krb5 header: 16 + * - krb5 checksum: 20 + * + * for privacy mode, payload also include the cipher text which has the same + * size as plain text, plus possible confounder, padding both at maximum cipher + * block size. + */ +#define GSS_KRB5_INTEG_MAX_PAYLOAD (40) + +static inline +int gss_mech_payload(struct gss_ctx *mechctx, int msgsize, int privacy) +{ + if (privacy) + return GSS_KRB5_INTEG_MAX_PAYLOAD + 16 + 16 + 16 + msgsize; + else + return GSS_KRB5_INTEG_MAX_PAYLOAD; +} + +/* + * return signature size, otherwise < 0 to indicate error + */ +static int gss_sign_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + enum lustre_sec_part sp, + __u32 flags, __u32 proc, __u32 seq, __u32 svc, + rawobj_t *handle) +{ + struct gss_header *ghdr; + rawobj_t text[4], mic; + int textcnt, max_textcnt, mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + /* gss hdr */ + LASSERT(msg->lm_buflens[0] >= + sizeof(*ghdr) + (handle ? handle->len : 0)); + ghdr = lustre_msg_buf(msg, 0, 0); + + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) sp; + ghdr->gh_flags = flags; + ghdr->gh_proc = proc; + ghdr->gh_seq = seq; + ghdr->gh_svc = svc; + if (!handle) { + /* fill in a fake one */ + ghdr->gh_handle.len = 0; + } else { + ghdr->gh_handle.len = handle->len; + memcpy(ghdr->gh_handle.data, handle->data, handle->len); + } + + /* no actual signature for null mode */ + if (svc == SPTLRPC_SVC_NULL) + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + /* MIC */ + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) { + CERROR("fail to generate MIC: %08x\n", major); + return -EPERM; + } + LASSERT(mic.len <= msg->lm_buflens[mic_idx]); + + return lustre_shrink_msg(msg, mic_idx, mic.len, 0); +} + +/* + * return gss error + */ +static +__u32 gss_verify_msg(struct lustre_msg *msg, + struct gss_ctx *mechctx, + __u32 svc) +{ + rawobj_t text[4], mic; + int textcnt, max_textcnt; + int mic_idx; + __u32 major; + + LASSERT(msg->lm_bufcount >= 2); + + if (svc == SPTLRPC_SVC_NULL) + return GSS_S_COMPLETE; + + mic_idx = msg_last_segidx(msg); + max_textcnt = (svc == SPTLRPC_SVC_AUTH) ? 1 : mic_idx; + + for (textcnt = 0; textcnt < max_textcnt; textcnt++) { + text[textcnt].len = msg->lm_buflens[textcnt]; + text[textcnt].data = lustre_msg_buf(msg, textcnt, 0); + } + + mic.len = msg->lm_buflens[mic_idx]; + mic.data = lustre_msg_buf(msg, mic_idx, 0); + + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); + if (major != GSS_S_COMPLETE) + CERROR("mic verify error: %08x\n", major); + + return major; +} + +/* + * return gss error code + */ +static +__u32 gss_unseal_msg(struct gss_ctx *mechctx, + struct lustre_msg *msgbuf, + int *msg_len, int msgbuf_len) +{ + rawobj_t clear_obj, hdrobj, token; + __u8 *clear_buf; + int clear_buflen; + __u32 major; + ENTRY; + + if (msgbuf->lm_bufcount != 2) { + CERROR("invalid bufcount %d\n", msgbuf->lm_bufcount); + RETURN(GSS_S_FAILURE); + } + + /* allocate a temporary clear text buffer, same sized as token, + * we assume the final clear text size <= token size */ + clear_buflen = lustre_msg_buflen(msgbuf, 1); + OBD_ALLOC_LARGE(clear_buf, clear_buflen); + if (!clear_buf) + RETURN(GSS_S_FAILURE); + + /* buffer objects */ + hdrobj.len = lustre_msg_buflen(msgbuf, 0); + hdrobj.data = lustre_msg_buf(msgbuf, 0, 0); + token.len = lustre_msg_buflen(msgbuf, 1); + token.data = lustre_msg_buf(msgbuf, 1, 0); + clear_obj.len = clear_buflen; + clear_obj.data = clear_buf; + + major = lgss_unwrap(mechctx, &hdrobj, &token, &clear_obj); + if (major != GSS_S_COMPLETE) { + CERROR("unwrap message error: %08x\n", major); + GOTO(out_free, major = GSS_S_FAILURE); + } + LASSERT(clear_obj.len <= clear_buflen); + LASSERT(clear_obj.len <= msgbuf_len); + + /* now the decrypted message */ + memcpy(msgbuf, clear_obj.data, clear_obj.len); + *msg_len = clear_obj.len; + + major = GSS_S_COMPLETE; +out_free: + OBD_FREE_LARGE(clear_buf, clear_buflen); + RETURN(major); +} + +/******************************************** + * gss client context manipulation helpers * + ********************************************/ + +int cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->cc_refcount)); + + if (!test_and_set_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags)) { + if (!ctx->cc_early_expire) + clear_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + CWARN("ctx %p(%u->%s) get expired: %lu(%+lds)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, + ctx->cc_expire == 0 ? 0 : + cfs_time_sub(ctx->cc_expire, cfs_time_current_sec())); + + sptlrpc_cli_ctx_wakeup(ctx); + return 1; + } + + return 0; +} + +/* + * return 1 if the context is dead. + */ +int cli_ctx_check_death(struct ptlrpc_cli_ctx *ctx) +{ + if (unlikely(cli_ctx_is_dead(ctx))) + return 1; + + /* expire is 0 means never expire. a newly created gss context + * which during upcall may has 0 expiration */ + if (ctx->cc_expire == 0) + return 0; + + /* check real expiration */ + if (cfs_time_after(ctx->cc_expire, cfs_time_current_sec())) + return 0; + + cli_ctx_expire(ctx); + return 1; +} + +void gss_cli_ctx_uptodate(struct gss_cli_ctx *gctx) +{ + struct ptlrpc_cli_ctx *ctx = &gctx->gc_base; + unsigned long ctx_expiry; + + if (lgss_inquire_context(gctx->gc_mechctx, &ctx_expiry)) { + CERROR("ctx %p(%u): unable to inquire, expire it now\n", + gctx, ctx->cc_vcred.vc_uid); + ctx_expiry = 1; /* make it expired now */ + } + + ctx->cc_expire = gss_round_ctx_expiry(ctx_expiry, + ctx->cc_sec->ps_flvr.sf_flags); + + /* At this point this ctx might have been marked as dead by + * someone else, in which case nobody will make further use + * of it. we don't care, and mark it UPTODATE will help + * destroying server side context when it be destroied. */ + set_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags); + + if (sec_is_reverse(ctx->cc_sec)) { + CWARN("server installed reverse ctx %p idx "LPX64", " + "expiry %lu(%+lds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec()); + } else { + CWARN("client refreshed ctx %p idx "LPX64" (%u->%s), " + "expiry %lu(%+lds)\n", ctx, + gss_handle_to_u64(&gctx->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + ctx->cc_expire, ctx->cc_expire - cfs_time_current_sec()); + + /* install reverse svc ctx for root context */ + if (ctx->cc_vcred.vc_uid == 0) + gss_sec_install_rctx(ctx->cc_sec->ps_import, + ctx->cc_sec, ctx); + } + + sptlrpc_cli_ctx_wakeup(ctx); +} + +static void gss_cli_ctx_finalize(struct gss_cli_ctx *gctx) +{ + LASSERT(gctx->gc_base.cc_sec); + + if (gctx->gc_mechctx) { + lgss_delete_sec_context(&gctx->gc_mechctx); + gctx->gc_mechctx = NULL; + } + + if (!rawobj_empty(&gctx->gc_svc_handle)) { + /* forward ctx: mark buddy reverse svcctx soon-expire. */ + if (!sec_is_reverse(gctx->gc_base.cc_sec) && + !rawobj_empty(&gctx->gc_svc_handle)) + gss_svc_upcall_expire_rvs_ctx(&gctx->gc_svc_handle); + + rawobj_free(&gctx->gc_svc_handle); + } + + rawobj_free(&gctx->gc_handle); +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. + * + * modified for our own problem: arriving request has valid sequence number, + * but unwrapping request might cost a long time, after that its sequence + * are not valid anymore (fall behind the window). It rarely happen, mostly + * under extreme load. + * + * note we should not check sequence before verify the integrity of incoming + * request, because just one attacking request with high sequence number might + * cause all following request be dropped. + * + * so here we use a multi-phase approach: prepare 2 sequence windows, + * "main window" for normal sequence and "back window" for fall behind sequence. + * and 3-phase checking mechanism: + * 0 - before integrity verification, perform a initial sequence checking in + * main window, which only try and don't actually set any bits. if the + * sequence is high above the window or fit in the window and the bit + * is 0, then accept and proceed to integrity verification. otherwise + * reject this sequence. + * 1 - after integrity verification, check in main window again. if this + * sequence is high above the window or fit in the window and the bit + * is 0, then set the bit and accept; if it fit in the window but bit + * already set, then reject; if it fall behind the window, then proceed + * to phase 2. + * 2 - check in back window. if it is high above the window or fit in the + * window and the bit is 0, then set the bit and accept. otherwise reject. + * + * return value: + * 1: looks like a replay + * 0: is ok + * -1: is a replay + * + * note phase 0 is necessary, because otherwise replay attacking request of + * sequence which between the 2 windows can't be detected. + * + * this mechanism can't totally solve the problem, but could help much less + * number of valid requests be dropped. + */ +static +int gss_do_check_seq(unsigned long *window, __u32 win_size, __u32 *max_seq, + __u32 seq_num, int phase) +{ + LASSERT(phase >= 0 && phase <= 2); + + if (seq_num > *max_seq) { + /* + * 1. high above the window + */ + if (phase == 0) + return 0; + + if (seq_num >= *max_seq + win_size) { + memset(window, 0, win_size / 8); + *max_seq = seq_num; + } else { + while(*max_seq < seq_num) { + (*max_seq)++; + __clear_bit((*max_seq) % win_size, window); + } + } + __set_bit(seq_num % win_size, window); + } else if (seq_num + win_size <= *max_seq) { + /* + * 2. low behind the window + */ + if (phase == 0 || phase == 2) + goto replay; + + CWARN("seq %u is %u behind (size %d), check backup window\n", + seq_num, *max_seq - win_size - seq_num, win_size); + return 1; + } else { + /* + * 3. fit into the window + */ + switch (phase) { + case 0: + if (test_bit(seq_num % win_size, window)) + goto replay; + break; + case 1: + case 2: + if (__test_and_set_bit(seq_num % win_size, window)) + goto replay; + break; + } + } + + return 0; + +replay: + CERROR("seq %u (%s %s window) is a replay: max %u, winsize %d\n", + seq_num, + seq_num + win_size > *max_seq ? "in" : "behind", + phase == 2 ? "backup " : "main", + *max_seq, win_size); + return -1; +} + +/* + * Based on sequence number algorithm as specified in RFC 2203. + * + * if @set == 0: initial check, don't set any bit in window + * if @sec == 1: final check, set bit in window + */ +int gss_check_seq_num(struct gss_svc_seq_data *ssd, __u32 seq_num, int set) +{ + int rc = 0; + + spin_lock(&ssd->ssd_lock); + + if (set == 0) { + /* + * phase 0 testing + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 0); + if (unlikely(rc)) + gss_stat_oos_record_svc(0, 1); + } else { + /* + * phase 1 checking main window + */ + rc = gss_do_check_seq(ssd->ssd_win_main, GSS_SEQ_WIN_MAIN, + &ssd->ssd_max_main, seq_num, 1); + switch (rc) { + case -1: + gss_stat_oos_record_svc(1, 1); + /* fall through */ + case 0: + goto exit; + } + /* + * phase 2 checking back window + */ + rc = gss_do_check_seq(ssd->ssd_win_back, GSS_SEQ_WIN_BACK, + &ssd->ssd_max_back, seq_num, 2); + if (rc) + gss_stat_oos_record_svc(2, 1); + else + gss_stat_oos_record_svc(2, 0); + } +exit: + spin_unlock(&ssd->ssd_lock); + return rc; +} + +/*************************************** + * cred APIs * + ***************************************/ + +static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, + int msgsize, int privacy) +{ + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) +{ + return (ctx->cc_vcred.vc_uid == vcred->vc_uid); +} + +void gss_cli_ctx_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_CTX_NEW) + strncat(buf, "new,", bufsize); + if (flags & PTLRPC_CTX_UPTODATE) + strncat(buf, "uptodate,", bufsize); + if (flags & PTLRPC_CTX_DEAD) + strncat(buf, "dead,", bufsize); + if (flags & PTLRPC_CTX_ERROR) + strncat(buf, "error,", bufsize); + if (flags & PTLRPC_CTX_CACHED) + strncat(buf, "cached,", bufsize); + if (flags & PTLRPC_CTX_ETERNAL) + strncat(buf, "eternal,", bufsize); + if (buf[0] == '\0') + strncat(buf, "-,", bufsize); + + buf[strlen(buf) - 1] = '\0'; +} + +int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + __u32 flags = 0, seq, svc; + int rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(req->rq_cli_ctx == ctx); + + /* nothing to do for context negotiation RPCs */ + if (req->rq_ctx_init) + RETURN(0); + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + flags |= LUSTRE_GSS_PACK_USER; + +redo: + seq = atomic_inc_return(&gctx->gc_seq); + + rc = gss_sign_msg(req->rq_reqbuf, gctx->gc_mechctx, + ctx->cc_sec->ps_part, + flags, gctx->gc_proc, seq, svc, + &gctx->gc_handle); + if (rc < 0) + RETURN(rc); + + /* gss_sign_msg() msg might take long time to finish, in which period + * more rpcs could be wrapped up and sent out. if we found too many + * of them we should repack this rpc, because sent it too late might + * lead to the sequence number fall behind the window on server and + * be dropped. also applies to gss_cli_ctx_seal(). + * + * Note: null mode dosen't check sequence number. */ + if (svc != SPTLRPC_SVC_NULL && + atomic_read(&gctx->gc_seq) - seq > GSS_SEQ_REPACK_THRESHOLD) { + int behind = atomic_read(&gctx->gc_seq) - seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry signing\n", req, behind); + goto redo; + } + + req->rq_reqdata_len = rc; + RETURN(0); +} + +static +int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct gss_header *ghdr) +{ + struct gss_err_header *errhdr; + int rc; + + LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR); + + errhdr = (struct gss_err_header *) ghdr; + + CWARN("req x"LPU64"/t"LPU64", ctx %p idx "LPX64"(%u->%s): " + "%sserver respond (%08x/%08x)\n", + req->rq_xid, req->rq_transno, ctx, + gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle), + ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec), + sec_is_reverse(ctx->cc_sec) ? "reverse" : "", + errhdr->gh_major, errhdr->gh_minor); + + /* context fini rpc, let it failed */ + if (req->rq_ctx_fini) { + CWARN("context fini rpc failed\n"); + return -EINVAL; + } + + /* reverse sec, just return error, don't expire this ctx because it's + * crucial to callback rpcs. note if the callback rpc failed because + * of bit flip during network transfer, the client will be evicted + * directly. so more gracefully we probably want let it retry for + * number of times. */ + if (sec_is_reverse(ctx->cc_sec)) + return -EINVAL; + + if (errhdr->gh_major != GSS_S_NO_CONTEXT && + errhdr->gh_major != GSS_S_BAD_SIG) + return -EACCES; + + /* server return NO_CONTEXT might be caused by context expire + * or server reboot/failover. we try to refresh a new ctx which + * be transparent to upper layer. + * + * In some cases, our gss handle is possible to be incidentally + * identical to another handle since the handle itself is not + * fully random. In krb5 case, the GSS_S_BAD_SIG will be + * returned, maybe other gss error for other mechanism. + * + * if we add new mechanism, make sure the correct error are + * returned in this case. */ + CWARN("%s: server might lost the context, retrying\n", + errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG"); + + sptlrpc_cli_ctx_expire(ctx); + + /* we need replace the ctx right here, otherwise during + * resent we'll hit the logic in sptlrpc_req_refresh_ctx() + * which keep the ctx with RESEND flag, thus we'll never + * get rid of this ctx. */ + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc == 0) + req->rq_resend = 1; + + return rc; +} + +int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr, *reqhdr; + struct lustre_msg *msg = req->rq_repdata; + __u32 major; + int pack_bulk, swabbed, rc = 0; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* special case for context negotiation, rq_repmsg/rq_replen actually + * are not used currently. but early reply always be treated normally */ + if (req->rq_ctx_init && !req->rq_early) { + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + RETURN(0); + } + + if (msg->lm_bufcount < 2 || msg->lm_bufcount > 4) { + CERROR("unexpected bufcount %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + reqhdr = lustre_msg_buf(msg, 0, sizeof(*reqhdr)); + LASSERT(reqhdr); + + if (ghdr->gh_version != reqhdr->gh_version) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, reqhdr->gh_version); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (ghdr->gh_seq != reqhdr->gh_seq) { + CERROR("seqnum %u mismatch, expect %u\n", + ghdr->gh_seq, reqhdr->gh_seq); + RETURN(-EPROTO); + } + + if (ghdr->gh_svc != reqhdr->gh_svc) { + CERROR("svc %u mismatch, expect %u\n", + ghdr->gh_svc, reqhdr->gh_svc); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); + RETURN(-EPERM); + } + + if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { + __u32 cksum; + + cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(msg, 1, 0), + lustre_msg_buflen(msg, 1)); + if (cksum != msg->lm_cksum) { + CWARN("early reply checksum mismatch: " + "%08x != %08x\n", cksum, msg->lm_cksum); + RETURN(-EPROTO); + } + } + + if (pack_bulk) { + /* bulk checksum is right after the lustre msg */ + if (msg->lm_bufcount < 3) { + CERROR("Invalid reply bufcount %u\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + rc = bulk_sec_desc_unpack(msg, 2, swabbed); + if (rc) { + CERROR("unpack bulk desc: %d\n", rc); + RETURN(rc); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, 1, 0); + req->rq_replen = msg->lm_buflens[1]; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unknown gss proc %d\n", ghdr->gh_proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +int gss_cli_ctx_seal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u32 buflens[2], major; + int wiresize, rc; + ENTRY; + + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_reqlen); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + /* final clear data length */ + req->rq_clrdata_len = lustre_msg_size_v2(req->rq_clrbuf->lm_bufcount, + req->rq_clrbuf->lm_buflens); + + /* calculate wire data length */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(&gctx->gc_base, req->rq_clrdata_len, 1); + wiresize = lustre_msg_size_v2(2, buflens); + + /* allocate wire buffer */ + if (req->rq_pool) { + /* pre-allocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf != req->rq_clrbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + } else { + OBD_ALLOC_LARGE(req->rq_reqbuf, wiresize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + req->rq_reqbuf_len = wiresize; + } + + lustre_init_msg_v2(req->rq_reqbuf, 2, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + /* gss header */ + ghdr = lustre_msg_buf(req->rq_reqbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = (__u8) ctx->cc_sec->ps_part; + ghdr->gh_flags = 0; + ghdr->gh_proc = gctx->gc_proc; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = gctx->gc_handle.len; + memcpy(ghdr->gh_handle.data, gctx->gc_handle.data, gctx->gc_handle.len); + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + if (req->rq_pack_udesc) + ghdr->gh_flags |= LUSTRE_GSS_PACK_USER; + +redo: + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + + /* buffer objects */ + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = req->rq_clrdata_len; + msgobj.data = (__u8 *) req->rq_clrbuf; + token.len = lustre_msg_buflen(req->rq_reqbuf, 1); + token.data = lustre_msg_buf(req->rq_reqbuf, 1, 0); + + major = lgss_wrap(gctx->gc_mechctx, &hdrobj, &msgobj, + req->rq_clrbuf_len, &token); + if (major != GSS_S_COMPLETE) { + CERROR("priv: wrap message error: %08x\n", major); + GOTO(err_free, rc = -EPERM); + } + LASSERT(token.len <= buflens[1]); + + /* see explain in gss_cli_ctx_sign() */ + if (unlikely(atomic_read(&gctx->gc_seq) - ghdr->gh_seq > + GSS_SEQ_REPACK_THRESHOLD)) { + int behind = atomic_read(&gctx->gc_seq) - ghdr->gh_seq; + + gss_stat_oos_record_cli(behind); + CWARN("req %p: %u behind, retry sealing\n", req, behind); + + ghdr->gh_seq = atomic_inc_return(&gctx->gc_seq); + goto redo; + } + + /* now set the final wire data length */ + req->rq_reqdata_len = lustre_shrink_msg(req->rq_reqbuf, 1, token.len,0); + RETURN(0); + +err_free: + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + RETURN(rc); +} + +int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req) +{ + struct gss_cli_ctx *gctx; + struct gss_header *ghdr; + struct lustre_msg *msg = req->rq_repdata; + int msglen, pack_bulk, swabbed, rc; + __u32 major; + ENTRY; + + LASSERT(req->rq_cli_ctx == ctx); + LASSERT(req->rq_ctx_init == 0); + LASSERT(msg); + + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + swabbed = ptlrpc_rep_need_swab(req); + + ghdr = gss_swab_header(msg, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(-EPROTO); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u mismatch, expect %u\n", + ghdr->gh_version, PTLRPC_GSS_VERSION); + RETURN(-EPROTO); + } + + switch (ghdr->gh_proc) { + case PTLRPC_GSS_PROC_DATA: + pack_bulk = ghdr->gh_flags & LUSTRE_GSS_PACK_BULK; + + if (!req->rq_early && !equi(req->rq_pack_bulk == 1, pack_bulk)){ + CERROR("%s bulk flag in reply\n", + req->rq_pack_bulk ? "missing" : "unexpected"); + RETURN(-EPROTO); + } + + if (swabbed) + gss_header_swabber(ghdr); + + /* use rq_repdata_len as buffer size, which assume unseal + * doesn't need extra memory space. for precise control, we'd + * better calculate out actual buffer size as + * (repbuf_len - offset - repdata_len) */ + major = gss_unseal_msg(gctx->gc_mechctx, msg, + &msglen, req->rq_repdata_len); + if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); + rc = -EPERM; + break; + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EPROTO); + } + + if (msg->lm_bufcount < 1) { + CERROR("Invalid reply buffer: empty\n"); + RETURN(-EPROTO); + } + + if (pack_bulk) { + if (msg->lm_bufcount < 2) { + CERROR("bufcount %u: missing bulk sec desc\n", + msg->lm_bufcount); + RETURN(-EPROTO); + } + + /* bulk checksum is the last segment */ + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1, + swabbed)) + RETURN(-EPROTO); + } + + req->rq_repmsg = lustre_msg_buf(msg, 0, 0); + req->rq_replen = msg->lm_buflens[0]; + + rc = 0; + break; + case PTLRPC_GSS_PROC_ERR: + if (req->rq_early) { + CERROR("server return error with early reply\n"); + rc = -EPROTO; + } else { + rc = gss_cli_ctx_handle_err_notify(ctx, req, ghdr); + } + break; + default: + CERROR("unexpected proc %d\n", ghdr->gh_proc); + rc = -EPERM; + } + + RETURN(rc); +} + +/********************************************* + * reverse context installation * + *********************************************/ + +static inline +int gss_install_rvs_svc_ctx(struct obd_import *imp, + struct gss_sec *gsec, + struct gss_cli_ctx *gctx) +{ + return gss_svc_upcall_install_rvs_ctx(imp, gsec, gctx); +} + +/********************************************* + * GSS security APIs * + *********************************************/ +int gss_sec_create_common(struct gss_sec *gsec, + struct ptlrpc_sec_policy *policy, + struct obd_import *imp, + struct ptlrpc_svc_ctx *svcctx, + struct sptlrpc_flavor *sf) +{ + struct ptlrpc_sec *sec; + + LASSERT(imp); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + if (!gsec->gs_mech) { + CERROR("gss backend 0x%x not found\n", + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); + return -EOPNOTSUPP; + } + + spin_lock_init(&gsec->gs_lock); + gsec->gs_rvs_hdl = 0ULL; + + /* initialize upper ptlrpc_sec */ + sec = &gsec->gs_base; + sec->ps_policy = policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_flvr = *sf; + sec->ps_import = class_import_get(imp); + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + + if (!svcctx) { + sec->ps_gc_interval = GSS_GC_INTERVAL; + } else { + LASSERT(sec_is_reverse(sec)); + + /* never do gc on reverse sec */ + sec->ps_gc_interval = 0; + } + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_add_user(); + + CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""), + policy->sp_name, gsec); + return 0; +} + +void gss_sec_destroy_common(struct gss_sec *gsec) +{ + struct ptlrpc_sec *sec = &gsec->gs_base; + ENTRY; + + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + + if (gsec->gs_mech) { + lgss_mech_put(gsec->gs_mech); + gsec->gs_mech = NULL; + } + + class_import_put(sec->ps_import); + + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) + sptlrpc_enc_pool_del_user(); + + EXIT; +} + +void gss_sec_kill(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +int gss_cli_ctx_init_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_ctx_ops *ctxops, + struct vfs_cred *vcred) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + gctx->gc_win = 0; + atomic_set(&gctx->gc_seq, 0); + + INIT_HLIST_NODE(&ctx->cc_cache); + atomic_set(&ctx->cc_refcount, 0); + ctx->cc_sec = sec; + ctx->cc_ops = ctxops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_NEW; + ctx->cc_vcred = *vcred; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + /* take a ref on belonging sec, balanced in ctx destroying */ + atomic_inc(&sec->ps_refcount); + /* statistic only */ + atomic_inc(&sec->ps_nctx); + + CDEBUG(D_SEC, "%s@%p: create ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + return 0; +} + +/* + * return value: + * 1: the context has been taken care of by someone else + * 0: proceed to really destroy the context locally + */ +int gss_cli_ctx_fini_common(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_cli_ctx *gctx = ctx2gctx(ctx); + + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + /* + * remove UPTODATE flag of reverse ctx thus we won't send fini rpc, + * this is to avoid potential problems of client side reverse svc ctx + * be mis-destroyed in various recovery senarios. anyway client can + * manage its reverse ctx well by associating it with its buddy ctx. + */ + if (sec_is_reverse(sec)) + ctx->cc_flags &= ~PTLRPC_CTX_UPTODATE; + + if (gctx->gc_mechctx) { + /* the final context fini rpc will use this ctx too, and it's + * asynchronous which finished by request_out_callback(). so + * we add refcount, whoever drop finally drop the refcount to + * 0 should responsible for the rest of destroy. */ + atomic_inc(&ctx->cc_refcount); + + gss_do_ctx_fini_rpc(gctx); + gss_cli_ctx_finalize(gctx); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return 1; + } + + if (sec_is_reverse(sec)) + CWARN("reverse sec %p: destroy ctx %p\n", + ctx->cc_sec, ctx); + else + CWARN("%s@%p: destroy ctx %p(%u->%s)\n", + sec->ps_policy->sp_name, ctx->cc_sec, + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + + return 0; +} + +static +int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int bufsize, txtsize; + int bufcnt = 2; + __u32 buflens[5]; + ENTRY; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - user descriptor (optional) + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. + */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_udesc) { + buflens[bufcnt] = sptlrpc_current_user_desc_size(); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + bufsize = lustre_msg_size_v2(bufcnt, buflens); + + if (!req->rq_reqbuf) { + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_reqbuf, bufsize); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = bufsize; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= bufsize); + memset(req->rq_reqbuf, 0, bufsize); + } + + lustre_init_msg_v2(req->rq_reqbuf, bufcnt, buflens, NULL); + req->rq_reqbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, msgsize); + LASSERT(req->rq_reqmsg); + + /* pack user desc here, later we might leave current user's process */ + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, 2); + + RETURN(0); +} + +static +int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 ibuflens[3], wbuflens[2]; + int ibufcnt; + int clearsize, wiresize; + ENTRY; + + LASSERT(req->rq_clrbuf == NULL); + LASSERT(req->rq_clrbuf_len == 0); + + /* Inner (clear) buffers + * - lustre message + * - user descriptor (optional) + * - bulk checksum (optional) + */ + ibufcnt = 1; + ibuflens[0] = msgsize; + + if (req->rq_pack_udesc) + ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); + if (req->rq_pack_bulk) + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); + + clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); + /* to allow append padding during encryption */ + clearsize += GSS_MAX_CIPHER_BLOCK; + + /* Wrapper (wire) buffers + * - gss header + * - cipher text + */ + wbuflens[0] = PTLRPC_GSS_HEADER_SIZE; + wbuflens[1] = gss_cli_payload(req->rq_cli_ctx, clearsize, 1); + wiresize = lustre_msg_size_v2(2, wbuflens); + + if (req->rq_pool) { + /* rq_reqbuf is preallocated */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= wiresize); + + memset(req->rq_reqbuf, 0, req->rq_reqbuf_len); + + /* if the pre-allocated buffer is big enough, we just pack + * both clear buf & request buf in it, to avoid more alloc. */ + if (clearsize + wiresize <= req->rq_reqbuf_len) { + req->rq_clrbuf = + (void *) (((char *) req->rq_reqbuf) + wiresize); + } else { + CWARN("pre-allocated buf size %d is not enough for " + "both clear (%d) and cipher (%d) text, proceed " + "with extra allocation\n", req->rq_reqbuf_len, + clearsize, wiresize); + } + } + + if (!req->rq_clrbuf) { + clearsize = size_roundup_power2(clearsize); + + OBD_ALLOC_LARGE(req->rq_clrbuf, clearsize); + if (!req->rq_clrbuf) + RETURN(-ENOMEM); + } + req->rq_clrbuf_len = clearsize; + + lustre_init_msg_v2(req->rq_clrbuf, ibufcnt, ibuflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, msgsize); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_clrbuf, 1); + + RETURN(0); +} + +/* + * NOTE: any change of request buffer allocation should also consider + * changing enlarge_reqbuf() series functions. + */ +int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_reqbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_reqbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + int privacy; + ENTRY; + + LASSERT(!req->rq_pool || req->rq_reqbuf); + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + + if (!req->rq_clrbuf) + goto release_reqbuf; + + /* release clear buffer */ + LASSERT(privacy); + LASSERT(req->rq_clrbuf_len); + + if (req->rq_pool == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + + req->rq_clrbuf = NULL; + req->rq_clrbuf_len = 0; + +release_reqbuf: + if (!req->rq_pool && req->rq_reqbuf) { + LASSERT(req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + + EXIT; +} + +static int do_alloc_repbuf(struct ptlrpc_request *req, int bufsize) +{ + bufsize = size_roundup_power2(bufsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, bufsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = bufsize; + return 0; +} + +static +int gss_alloc_repbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, int msgsize) +{ + int txtsize; + __u32 buflens[4]; + int bufcnt = 2; + int alloc_size; + + /* + * on-wire data layout: + * - gss header + * - lustre message + * - bulk sec descriptor (optional) + * - signature (optional) + * - svc == NULL: NULL + * - svc == AUTH: signature of gss header + * - svc == INTG: signature of all above + * + * if this is context negotiation, reserver fixed space + * at the last (signature) segment regardless of svc mode. + */ + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + txtsize = buflens[0]; + + buflens[1] = msgsize; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if (req->rq_ctx_init) + buflens[bufcnt++] = GSS_CTX_INIT_MAX_LEN; + else if (svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_cli_payload(req->rq_cli_ctx, txtsize,0); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + + /* add space for early reply */ + alloc_size += gss_at_reply_off_integ; + + return do_alloc_repbuf(req, alloc_size); +} + +static +int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int txtsize; + __u32 buflens[2]; + int bufcnt; + int alloc_size; + + /* inner buffers */ + bufcnt = 1; + buflens[0] = msgsize; + + if (req->rq_pack_bulk) + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); + txtsize = lustre_msg_size_v2(bufcnt, buflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffers */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, txtsize, 1); + + alloc_size = lustre_msg_size_v2(bufcnt, buflens); + /* add space for early reply */ + alloc_size += gss_at_reply_off_priv; + + return do_alloc_repbuf(req, alloc_size); +} + +int gss_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + ENTRY; + + LASSERT(!req->rq_pack_bulk || + (req->rq_bulk_read || req->rq_bulk_write)); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_alloc_repbuf_intg(sec, req, svc, msgsize); + case SPTLRPC_SVC_PRIV: + return gss_alloc_repbuf_priv(sec, req, msgsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +void gss_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + req->rq_repdata = NULL; + req->rq_repdata_len = 0; +} + +static int get_enlarged_msgsize(struct lustre_msg *msg, + int segment, int newsize) +{ + int save, newmsg_size; + + LASSERT(newsize >= msg->lm_buflens[segment]); + + save = msg->lm_buflens[segment]; + msg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment] = save; + + return newmsg_size; +} + +static int get_enlarged_msgsize2(struct lustre_msg *msg, + int segment1, int newsize1, + int segment2, int newsize2) +{ + int save1, save2, newmsg_size; + + LASSERT(newsize1 >= msg->lm_buflens[segment1]); + LASSERT(newsize2 >= msg->lm_buflens[segment2]); + + save1 = msg->lm_buflens[segment1]; + save2 = msg->lm_buflens[segment2]; + msg->lm_buflens[segment1] = newsize1; + msg->lm_buflens[segment2] = newsize2; + newmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + msg->lm_buflens[segment1] = save1; + msg->lm_buflens[segment2] = save2; + + return newmsg_size; +} + +static +int gss_enlarge_reqbuf_intg(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int svc, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int txtsize, sigsize = 0, i; + int newmsg_size, newbuf_size; + + /* + * gss header is at seg 0; + * embedded msg is at seg 1; + * signature (if any) is at the last seg + */ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len > req->rq_reqlen); + LASSERT(req->rq_reqbuf->lm_bufcount >= 2); + LASSERT(lustre_msg_buf(req->rq_reqbuf, 1, 0) == req->rq_reqmsg); + + /* 1. compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + LASSERT(newmsg_size >= req->rq_reqbuf->lm_buflens[1]); + + /* 2. compute new wrapper msg size */ + if (svc == SPTLRPC_SVC_NULL) { + /* no signature, get size directly */ + newbuf_size = get_enlarged_msgsize(req->rq_reqbuf, + 1, newmsg_size); + } else { + txtsize = req->rq_reqbuf->lm_buflens[0]; + + if (svc == SPTLRPC_SVC_INTG) { + for (i = 1; i < req->rq_reqbuf->lm_bufcount; i++) + txtsize += req->rq_reqbuf->lm_buflens[i]; + txtsize += newmsg_size - req->rq_reqbuf->lm_buflens[1]; + } + + sigsize = gss_cli_payload(req->rq_cli_ctx, txtsize, 0); + LASSERT(sigsize >= msg_last_seglen(req->rq_reqbuf)); + + newbuf_size = get_enlarged_msgsize2( + req->rq_reqbuf, + 1, newmsg_size, + msg_last_segidx(req->rq_reqbuf), + sigsize); + } + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 1, 0); + } + + /* do enlargement, from wrapper to embedded, from end to begin */ + if (svc != SPTLRPC_SVC_NULL) + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, + msg_last_segidx(req->rq_reqbuf), + sigsize); + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, 1, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +static +int gss_enlarge_reqbuf_priv(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newclrbuf; + int newmsg_size, newclrbuf_size, newcipbuf_size; + __u32 buflens[3]; + + /* + * embedded msg is at seg 0 of clear buffer; + * cipher text is at seg 2 of cipher buffer; + */ + LASSERT(req->rq_pool || + (req->rq_reqbuf == NULL && req->rq_reqbuf_len == 0)); + LASSERT(req->rq_reqbuf == NULL || + (req->rq_pool && req->rq_reqbuf->lm_bufcount == 3)); + LASSERT(req->rq_clrbuf); + LASSERT(req->rq_clrbuf_len > req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_clrbuf, 0, 0) == req->rq_reqmsg); + + /* compute new embedded msg size */ + newmsg_size = get_enlarged_msgsize(req->rq_reqmsg, segment, newsize); + + /* compute new clear buffer size */ + newclrbuf_size = get_enlarged_msgsize(req->rq_clrbuf, 0, newmsg_size); + newclrbuf_size += GSS_MAX_CIPHER_BLOCK; + + /* compute new cipher buffer size */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(req->rq_cli_ctx, buflens[0], 0); + buflens[2] = gss_cli_payload(req->rq_cli_ctx, newclrbuf_size, 1); + newcipbuf_size = lustre_msg_size_v2(3, buflens); + + /* handle the case that we put both clear buf and cipher buf into + * pre-allocated single buffer. */ + if (unlikely(req->rq_pool) && + req->rq_clrbuf >= req->rq_reqbuf && + (char *) req->rq_clrbuf < + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + /* it couldn't be better we still fit into the + * pre-allocated buffer. */ + if (newclrbuf_size + newcipbuf_size <= req->rq_reqbuf_len) { + void *src, *dst; + + /* move clear text backward. */ + src = req->rq_clrbuf; + dst = (char *) req->rq_reqbuf + newcipbuf_size; + + memmove(dst, src, req->rq_clrbuf_len); + + req->rq_clrbuf = (struct lustre_msg *) dst; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + } else { + /* sadly we have to split out the clear buffer */ + LASSERT(req->rq_reqbuf_len >= newcipbuf_size); + LASSERT(req->rq_clrbuf_len < newclrbuf_size); + } + } + + if (req->rq_clrbuf_len < newclrbuf_size) { + newclrbuf_size = size_roundup_power2(newclrbuf_size); + + OBD_ALLOC_LARGE(newclrbuf, newclrbuf_size); + if (newclrbuf == NULL) + RETURN(-ENOMEM); + + memcpy(newclrbuf, req->rq_clrbuf, req->rq_clrbuf_len); + + if (req->rq_reqbuf == NULL || + req->rq_clrbuf < req->rq_reqbuf || + (char *) req->rq_clrbuf >= + (char *) req->rq_reqbuf + req->rq_reqbuf_len) { + OBD_FREE_LARGE(req->rq_clrbuf, req->rq_clrbuf_len); + } + + req->rq_clrbuf = newclrbuf; + req->rq_clrbuf_len = newclrbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_clrbuf, 0, 0); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_clrbuf, 0, newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + RETURN(0); +} + +int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + + LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); + + switch (svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + return gss_enlarge_reqbuf_intg(sec, req, svc, segment, newsize); + case SPTLRPC_SVC_PRIV: + return gss_enlarge_reqbuf_priv(sec, req, segment, newsize); + default: + LASSERTF(0, "bad rpc flavor %x\n", req->rq_flvr.sf_rpc); + return 0; + } +} + +int gss_sec_install_rctx(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx) +{ + struct gss_sec *gsec; + struct gss_cli_ctx *gctx; + int rc; + + gsec = container_of(sec, struct gss_sec, gs_base); + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + + rc = gss_install_rvs_svc_ctx(imp, gsec, gctx); + return rc; +} + +/******************************************** + * server side API * + ********************************************/ + +static inline +int gss_svc_reqctx_is_special(struct gss_svc_reqctx *grctx) +{ + LASSERT(grctx); + return (grctx->src_init || grctx->src_init_continue || + grctx->src_err_notify); +} + +static +void gss_svc_reqctx_free(struct gss_svc_reqctx *grctx) +{ + if (grctx->src_ctx) + gss_svc_upcall_put_ctx(grctx->src_ctx); + + sptlrpc_policy_put(grctx->src_base.sc_policy); + OBD_FREE_PTR(grctx); +} + +static inline +void gss_svc_reqctx_addref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + atomic_inc(&grctx->src_base.sc_refcount); +} + +static inline +void gss_svc_reqctx_decref(struct gss_svc_reqctx *grctx) +{ + LASSERT(atomic_read(&grctx->src_base.sc_refcount) > 0); + + if (atomic_dec_and_test(&grctx->src_base.sc_refcount)) + gss_svc_reqctx_free(grctx); +} + +static +int gss_svc_sign(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx, + __u32 svc) +{ + __u32 flags = 0; + int rc; + ENTRY; + + LASSERT(rs->rs_msg == lustre_msg_buf(rs->rs_repbuf, 1, 0)); + + /* embedded lustre_msg might have been shrinked */ + if (req->rq_replen != rs->rs_repbuf->lm_buflens[1]) + lustre_shrink_msg(rs->rs_repbuf, 1, req->rq_replen, 1); + + if (req->rq_pack_bulk) + flags |= LUSTRE_GSS_PACK_BULK; + + rc = gss_sign_msg(rs->rs_repbuf, grctx->src_ctx->gsc_mechctx, + LUSTRE_SP_ANY, flags, PTLRPC_GSS_PROC_DATA, + grctx->src_wirectx.gw_seq, svc, NULL); + if (rc < 0) + RETURN(rc); + + rs->rs_repdata_len = rc; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = gss_at_reply_off_integ; + else + req->rq_reply_off = 0; + } else { + if (svc == SPTLRPC_SVC_NULL) + rs->rs_repbuf->lm_cksum = crc32_le(!(__u32) 0, + lustre_msg_buf(rs->rs_repbuf, 1, 0), + lustre_msg_buflen(rs->rs_repbuf, 1)); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct ptlrpc_reply_state *rs; + struct gss_err_header *ghdr; + int replen = sizeof(struct ptlrpc_body); + int rc; + ENTRY; + + //if (OBD_FAIL_CHECK_ORSET(OBD_FAIL_SVCGSS_ERR_NOTIFY, OBD_FAIL_ONCE)) + // RETURN(-EINVAL); + + grctx->src_err_notify = 1; + grctx->src_reserve_len = 0; + + rc = lustre_pack_reply_v2(req, 1, &replen, NULL, 0); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + /* gss hdr */ + rs = req->rq_reply_state; + LASSERT(rs->rs_repbuf->lm_buflens[1] >= sizeof(*ghdr)); + ghdr = lustre_msg_buf(rs->rs_repbuf, 0, 0); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_ERR; + ghdr->gh_major = major; + ghdr->gh_minor = minor; + ghdr->gh_handle.len = 0; /* fake context handle */ + + rs->rs_repdata_len = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + CDEBUG(D_SEC, "prepare gss error notify(0x%x/0x%x) to %s\n", + major, minor, libcfs_nid2str(req->rq_peer.nid)); + RETURN(0); +} + +static +int gss_svc_handle_init(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; + + CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, + libcfs_nid2str(req->rq_peer.nid)); + + req->rq_ctx_init = 1; + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + CERROR("unexpected bulk flag\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { + CERROR("proc %u: invalid handle length %u\n", + gw->gw_proc, gw->gw_handle.len); + RETURN(SECSVC_DROP); + } + + if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4){ + CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + /* ctx initiate payload is in last segment */ + secdata = lustre_msg_buf(reqbuf, reqbuf->lm_bufcount - 1, 0); + seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; + + if (seclen < 4 + 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SECSVC_DROP); + } + + /* lustre svc type */ + lustre_svc = le32_to_cpu(*secdata++); + seclen -= 4; + + /* extract target uuid, note this code is somewhat fragile + * because touched internal structure of obd_uuid */ + if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { + CERROR("failed to extract target uuid\n"); + RETURN(SECSVC_DROP); + } + uuid_obj.data[uuid_obj.len - 1] = '\0'; + + uuid = (struct obd_uuid *) uuid_obj.data; + target = class_uuid2obd(uuid); + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("target '%s' is not available for context init (%s)\n", + uuid->uuid, target == NULL ? "no target" : + (target->obd_stopping ? "stopping" : "not set up")); + RETURN(SECSVC_DROP); + } + + /* extract reverse handle */ + if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { + CERROR("failed extract reverse handle\n"); + RETURN(SECSVC_DROP); + } + + /* extract token */ + if (rawobj_extract(&in_token, &secdata, &seclen)) { + CERROR("can't extract token\n"); + RETURN(SECSVC_DROP); + } + + rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, + &rvs_hdl, &in_token); + if (rc != SECSVC_OK) + RETURN(rc); + + if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || + grctx->src_ctx->gsc_usr_root) + CWARN("create svc ctx %p: user from %s authenticated as %s\n", + grctx->src_ctx, libcfs_nid2str(req->rq_peer.nid), + grctx->src_ctx->gsc_usr_mds ? "mds" : + (grctx->src_ctx->gsc_usr_oss ? "oss" : "root")); + else + CWARN("create svc ctx %p: accept user %u from %s\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid, + libcfs_nid2str(req->rq_peer.nid)); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor\n"); + RETURN(SECSVC_DROP); + } + if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(reqbuf, 2, 0); + } + + req->rq_reqmsg = lustre_msg_buf(reqbuf, 1, 0); + req->rq_reqlen = lustre_msg_buflen(reqbuf, 1); + + RETURN(rc); +} + +/* + * last segment must be the gss signature. + */ +static +int gss_svc_verify_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int offset = 2; + int swabbed; + ENTRY; + + *major = GSS_S_COMPLETE; + + if (msg->lm_bufcount < 2) { + CERROR("Too few segments (%u) in request\n", msg->lm_bufcount); + RETURN(-EINVAL); + } + + if (gw->gw_svc == SPTLRPC_SVC_NULL) + goto verified; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); + RETURN(-EACCES); + } + + if (gctx->gsc_reverse == 0 && + gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + +verified: + swabbed = ptlrpc_req_need_swab(req); + + /* user descriptor */ + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("no user desc included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + /* check bulk_sec_desc data */ + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < (offset + 1)) { + CERROR("missing bulk sec descriptor\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(msg, 1, 0); + req->rq_reqlen = msg->lm_buflens[1]; + RETURN(0); +} + +static +int gss_svc_unseal_request(struct ptlrpc_request *req, + struct gss_svc_reqctx *grctx, + struct gss_wire_ctx *gw, + __u32 *major) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + struct lustre_msg *msg = req->rq_reqbuf; + int swabbed, msglen, offset = 1; + ENTRY; + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 0)) { + CERROR("phase 0: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + *major = gss_unseal_msg(gctx->gsc_mechctx, msg, + &msglen, req->rq_reqdata_len); + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); + RETURN(-EACCES); + } + + if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { + CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); + *major = GSS_S_DUPLICATE_TOKEN; + RETURN(-EACCES); + } + + swabbed = __lustre_unpack_msg(msg, msglen); + if (swabbed < 0) { + CERROR("Failed to unpack after decryption\n"); + RETURN(-EINVAL); + } + req->rq_reqdata_len = msglen; + + if (msg->lm_bufcount < 1) { + CERROR("Invalid buffer: is empty\n"); + RETURN(-EINVAL); + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no user descriptor included\n"); + RETURN(-EINVAL); + } + + if (sptlrpc_unpack_user_desc(msg, offset, swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(-EINVAL); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, offset, 0); + offset++; + } + + if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { + if (msg->lm_bufcount < offset + 1) { + CERROR("no bulk checksum included\n"); + RETURN(-EINVAL); + } + + if (bulk_sec_desc_unpack(msg, offset, swabbed)) + RETURN(-EINVAL); + + req->rq_pack_bulk = 1; + grctx->src_reqbsd = lustre_msg_buf(msg, offset, 0); + grctx->src_reqbsd_size = lustre_msg_buflen(msg, offset); + } + + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, 0, 0); + req->rq_reqlen = req->rq_reqbuf->lm_buflens[0]; + RETURN(0); +} + +static +int gss_svc_handle_data(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major = 0; + int rc = 0; + ENTRY; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + major = GSS_S_NO_CONTEXT; + goto error; + } + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_verify_request(req, grctx, gw, &major); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_unseal_request(req, grctx, gw, &major); + break; + default: + CERROR("unsupported gss service %d\n", gw->gw_svc); + rc = -EINVAL; + } + + if (rc == 0) + RETURN(SECSVC_OK); + + CERROR("svc %u failed: major 0x%08x: req xid "LPU64" ctx %p idx " + LPX64"(%u->%s)\n", gw->gw_svc, major, req->rq_xid, + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); +error: + /* we only notify client in case of NO_CONTEXT/BAD_SIG, which + * might happen after server reboot, to allow recovery. */ + if ((major == GSS_S_NO_CONTEXT || major == GSS_S_BAD_SIG) && + gss_pack_err_notify(req, major, 0) == 0) + RETURN(SECSVC_COMPLETE); + + RETURN(SECSVC_DROP); +} + +static +int gss_svc_handle_destroy(struct ptlrpc_request *req, + struct gss_wire_ctx *gw) +{ + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + __u32 major; + ENTRY; + + req->rq_ctx_fini = 1; + req->rq_no_reply = 1; + + grctx->src_ctx = gss_svc_upcall_get_ctx(req, gw); + if (!grctx->src_ctx) { + CDEBUG(D_SEC, "invalid gss context handle for destroy.\n"); + RETURN(SECSVC_DROP); + } + + if (gw->gw_svc != SPTLRPC_SVC_INTG) { + CERROR("svc %u is not supported in destroy.\n", gw->gw_svc); + RETURN(SECSVC_DROP); + } + + if (gss_svc_verify_request(req, grctx, gw, &major)) + RETURN(SECSVC_DROP); + + CWARN("destroy svc ctx %p idx "LPX64" (%u->%s)\n", + grctx->src_ctx, gss_handle_to_u64(&gw->gw_handle), + grctx->src_ctx->gsc_uid, libcfs_nid2str(req->rq_peer.nid)); + + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { + if (req->rq_reqbuf->lm_bufcount < 4) { + CERROR("missing user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + if (sptlrpc_unpack_user_desc(req->rq_reqbuf, 2, + ptlrpc_req_need_swab(req))) { + CERROR("Mal-formed user descriptor, ignore it\n"); + RETURN(SECSVC_OK); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(req->rq_reqbuf, 2, 0); + } + + RETURN(SECSVC_OK); +} + +int gss_svc_accept(struct ptlrpc_sec_policy *policy, struct ptlrpc_request *req) +{ + struct gss_header *ghdr; + struct gss_svc_reqctx *grctx; + struct gss_wire_ctx *gw; + int swabbed, rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_svc_ctx == NULL); + + if (req->rq_reqbuf->lm_bufcount < 2) { + CERROR("buf count only %d\n", req->rq_reqbuf->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + ghdr = gss_swab_header(req->rq_reqbuf, 0, swabbed); + if (ghdr == NULL) { + CERROR("can't decode gss header\n"); + RETURN(SECSVC_DROP); + } + + /* sanity checks */ + if (ghdr->gh_version != PTLRPC_GSS_VERSION) { + CERROR("gss version %u, expect %u\n", ghdr->gh_version, + PTLRPC_GSS_VERSION); + RETURN(SECSVC_DROP); + } + + req->rq_sp_from = ghdr->gh_sp; + + /* alloc grctx data */ + OBD_ALLOC_PTR(grctx); + if (!grctx) + RETURN(SECSVC_DROP); + + grctx->src_base.sc_policy = sptlrpc_policy_get(policy); + atomic_set(&grctx->src_base.sc_refcount, 1); + req->rq_svc_ctx = &grctx->src_base; + gw = &grctx->src_wirectx; + + /* save wire context */ + gw->gw_flags = ghdr->gh_flags; + gw->gw_proc = ghdr->gh_proc; + gw->gw_seq = ghdr->gh_seq; + gw->gw_svc = ghdr->gh_svc; + rawobj_from_netobj(&gw->gw_handle, &ghdr->gh_handle); + + /* keep original wire header which subject to checksum verification */ + if (swabbed) + gss_header_swabber(ghdr); + + switch(ghdr->gh_proc) { + case PTLRPC_GSS_PROC_INIT: + case PTLRPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svc_handle_init(req, gw); + break; + case PTLRPC_GSS_PROC_DATA: + rc = gss_svc_handle_data(req, gw); + break; + case PTLRPC_GSS_PROC_DESTROY: + rc = gss_svc_handle_destroy(req, gw); + break; + default: + CERROR("unknown proc %u\n", gw->gw_proc); + rc = SECSVC_DROP; + break; + } + + switch (rc) { + case SECSVC_OK: + LASSERT (grctx->src_ctx); + + req->rq_auth_gss = 1; + req->rq_auth_remote = grctx->src_ctx->gsc_remote; + req->rq_auth_usr_mdt = grctx->src_ctx->gsc_usr_mds; + req->rq_auth_usr_ost = grctx->src_ctx->gsc_usr_oss; + req->rq_auth_usr_root = grctx->src_ctx->gsc_usr_root; + req->rq_auth_uid = grctx->src_ctx->gsc_uid; + req->rq_auth_mapped_uid = grctx->src_ctx->gsc_mapped_uid; + break; + case SECSVC_COMPLETE: + break; + case SECSVC_DROP: + gss_svc_reqctx_free(grctx); + req->rq_svc_ctx = NULL; + break; + } + + RETURN(rc); +} + +void gss_svc_invalidate_ctx(struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_svc_reqctx *grctx; + ENTRY; + + if (svc_ctx == NULL) { + EXIT; + return; + } + + grctx = gss_svc_ctx2reqctx(svc_ctx); + + CWARN("gss svc invalidate ctx %p(%u)\n", + grctx->src_ctx, grctx->src_ctx->gsc_uid); + gss_svc_upcall_destroy_ctx(grctx->src_ctx); + + EXIT; +} + +static inline +int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, + int msgsize, int privacy) +{ + /* we should treat early reply normally, but which is actually sharing + * the same ctx with original request, so in this case we should + * ignore the special ctx's special flags */ + if (early == 0 && gss_svc_reqctx_is_special(grctx)) + return grctx->src_reserve_len; + + return gss_mech_payload(NULL, msgsize, privacy); +} + +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + +int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_reply_state *rs; + int early, privacy, svc, bsd_off = 0; + __u32 ibuflens[2], buflens[4]; + int ibufcnt = 0, bufcnt; + int txtsize, wmsg_size, rs_size; + ENTRY; + + LASSERT(msglen % 8 == 0); + + if (req->rq_pack_bulk && !req->rq_bulk_read && !req->rq_bulk_write) { + CERROR("client request bulk sec on non-bulk rpc\n"); + RETURN(-EPROTO); + } + + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); + early = (req->rq_packed_final == 0); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + if (!early && gss_svc_reqctx_is_special(grctx)) + privacy = 0; + else + privacy = (svc == SPTLRPC_SVC_PRIV); + + if (privacy) { + /* inner clear buffers */ + ibufcnt = 1; + ibuflens[0] = msglen; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = ibufcnt; + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + } + + txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); + txtsize += GSS_MAX_CIPHER_BLOCK; + + /* wrapper buffer */ + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_svc_payload(grctx, early, txtsize, 1); + } else { + bufcnt = 2; + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = msglen; + + txtsize = buflens[0]; + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[1]; + + if (req->rq_pack_bulk) { + LASSERT(grctx->src_reqbsd); + + bsd_off = bufcnt; + buflens[bufcnt] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); + if (svc == SPTLRPC_SVC_INTG) + txtsize += buflens[bufcnt]; + bufcnt++; + } + + if ((!early && gss_svc_reqctx_is_special(grctx)) || + svc != SPTLRPC_SVC_NULL) + buflens[bufcnt++] = gss_svc_payload(grctx, early, + txtsize, 0); + } + + wmsg_size = lustre_msg_size_v2(bufcnt, buflens); + + rs_size = sizeof(*rs) + wmsg_size; + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = wmsg_size; + + /* initialize the buffer */ + if (privacy) { + lustre_init_msg_v2(rs->rs_repbuf, ibufcnt, ibuflens, NULL); + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 0, msglen); + } else { + lustre_init_msg_v2(rs->rs_repbuf, bufcnt, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + rs->rs_msg = lustre_msg_buf(rs->rs_repbuf, 1, 0); + } + + if (bsd_off) { + grctx->src_repbsd = lustre_msg_buf(rs->rs_repbuf, bsd_off, 0); + grctx->src_repbsd_size = lustre_msg_buflen(rs->rs_repbuf, + bsd_off); + } + + gss_svc_reqctx_addref(grctx); + rs->rs_svc_ctx = req->rq_svc_ctx; + + LASSERT(rs->rs_msg); + req->rq_reply_state = rs; + RETURN(0); +} + +static int gss_svc_seal(struct ptlrpc_request *req, + struct ptlrpc_reply_state *rs, + struct gss_svc_reqctx *grctx) +{ + struct gss_svc_ctx *gctx = grctx->src_ctx; + rawobj_t hdrobj, msgobj, token; + struct gss_header *ghdr; + __u8 *token_buf; + int token_buflen; + __u32 buflens[2], major; + int msglen, rc; + ENTRY; + + /* get clear data length. note embedded lustre_msg might + * have been shrinked */ + if (req->rq_replen != lustre_msg_buflen(rs->rs_repbuf, 0)) + msglen = lustre_shrink_msg(rs->rs_repbuf, 0, req->rq_replen, 1); + else + msglen = lustre_msg_size_v2(rs->rs_repbuf->lm_bufcount, + rs->rs_repbuf->lm_buflens); + + /* temporarily use tail of buffer to hold gss header data */ + LASSERT(msglen + PTLRPC_GSS_HEADER_SIZE <= rs->rs_repbuf_len); + ghdr = (struct gss_header *) ((char *) rs->rs_repbuf + + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE); + ghdr->gh_version = PTLRPC_GSS_VERSION; + ghdr->gh_sp = LUSTRE_SP_ANY; + ghdr->gh_flags = 0; + ghdr->gh_proc = PTLRPC_GSS_PROC_DATA; + ghdr->gh_seq = grctx->src_wirectx.gw_seq; + ghdr->gh_svc = SPTLRPC_SVC_PRIV; + ghdr->gh_handle.len = 0; + if (req->rq_pack_bulk) + ghdr->gh_flags |= LUSTRE_GSS_PACK_BULK; + + /* allocate temporary cipher buffer */ + token_buflen = gss_mech_payload(gctx->gsc_mechctx, msglen, 1); + OBD_ALLOC_LARGE(token_buf, token_buflen); + if (token_buf == NULL) + RETURN(-ENOMEM); + + hdrobj.len = PTLRPC_GSS_HEADER_SIZE; + hdrobj.data = (__u8 *) ghdr; + msgobj.len = msglen; + msgobj.data = (__u8 *) rs->rs_repbuf; + token.len = token_buflen; + token.data = token_buf; + + major = lgss_wrap(gctx->gsc_mechctx, &hdrobj, &msgobj, + rs->rs_repbuf_len - PTLRPC_GSS_HEADER_SIZE, &token); + if (major != GSS_S_COMPLETE) { + CERROR("wrap message error: %08x\n", major); + GOTO(out_free, rc = -EPERM); + } + LASSERT(token.len <= token_buflen); + + /* we are about to override data at rs->rs_repbuf, nullify pointers + * to which to catch further illegal usage. */ + if (req->rq_pack_bulk) { + grctx->src_repbsd = NULL; + grctx->src_repbsd_size = 0; + } + + /* now fill the actual wire data + * - gss header + * - gss token + */ + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = token.len; + + rs->rs_repdata_len = lustre_msg_size_v2(2, buflens); + LASSERT(rs->rs_repdata_len <= rs->rs_repbuf_len); + + lustre_init_msg_v2(rs->rs_repbuf, 2, buflens, NULL); + rs->rs_repbuf->lm_secflvr = req->rq_flvr.sf_rpc; + + memcpy(lustre_msg_buf(rs->rs_repbuf, 0, 0), ghdr, + PTLRPC_GSS_HEADER_SIZE); + memcpy(lustre_msg_buf(rs->rs_repbuf, 1, 0), token.data, token.len); + + /* reply offset */ + if (req->rq_packed_final && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) + req->rq_reply_off = gss_at_reply_off_priv; + else + req->rq_reply_off = 0; + + /* to catch upper layer's further access */ + rs->rs_msg = NULL; + req->rq_repmsg = NULL; + req->rq_replen = 0; + + rc = 0; +out_free: + OBD_FREE_LARGE(token_buf, token_buflen); + RETURN(rc); +} + +int gss_svc_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct gss_wire_ctx *gw = &grctx->src_wirectx; + int early, rc; + ENTRY; + + early = (req->rq_packed_final == 0); + + if (!early && gss_svc_reqctx_is_special(grctx)) { + LASSERT(rs->rs_repdata_len != 0); + + req->rq_reply_off = gss_at_reply_off_integ; + RETURN(0); + } + + /* early reply could happen in many cases */ + if (!early && + gw->gw_proc != PTLRPC_GSS_PROC_DATA && + gw->gw_proc != PTLRPC_GSS_PROC_DESTROY) { + CERROR("proc %d not support\n", gw->gw_proc); + RETURN(-EINVAL); + } + + LASSERT(grctx->src_ctx); + + switch (gw->gw_svc) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + rc = gss_svc_sign(req, rs, grctx, gw->gw_svc); + break; + case SPTLRPC_SVC_PRIV: + rc = gss_svc_seal(req, rs, grctx); + break; + default: + CERROR("Unknown service %d\n", gw->gw_svc); + GOTO(out, rc = -EINVAL); + } + rc = 0; + +out: + RETURN(rc); +} + +void gss_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct gss_svc_reqctx *grctx; + + LASSERT(rs->rs_svc_ctx); + grctx = container_of(rs->rs_svc_ctx, struct gss_svc_reqctx, src_base); + + gss_svc_reqctx_decref(grctx); + rs->rs_svc_ctx = NULL; + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +void gss_svc_free_ctx(struct ptlrpc_svc_ctx *ctx) +{ + LASSERT(atomic_read(&ctx->sc_refcount) == 0); + gss_svc_reqctx_free(gss_svc_ctx2reqctx(ctx)); +} + +int gss_copy_rvc_cli_ctx(struct ptlrpc_cli_ctx *cli_ctx, + struct ptlrpc_svc_ctx *svc_ctx) +{ + struct gss_cli_ctx *cli_gctx = ctx2gctx(cli_ctx); + struct gss_svc_ctx *svc_gctx = gss_svc_ctx2gssctx(svc_ctx); + struct gss_ctx *mechctx = NULL; + + LASSERT(cli_gctx); + LASSERT(svc_gctx && svc_gctx->gsc_mechctx); + + cli_gctx->gc_proc = PTLRPC_GSS_PROC_DATA; + cli_gctx->gc_win = GSS_SEQ_WIN; + + /* The problem is the reverse ctx might get lost in some recovery + * situations, and the same svc_ctx will be used to re-create it. + * if there's callback be sentout before that, new reverse ctx start + * with sequence 0 will lead to future callback rpc be treated as + * replay. + * + * each reverse root ctx will record its latest sequence number on its + * buddy svcctx before be destroied, so here we continue use it. + */ + atomic_set(&cli_gctx->gc_seq, svc_gctx->gsc_rvs_seq); + + if (gss_svc_upcall_dup_handle(&cli_gctx->gc_svc_handle, svc_gctx)) { + CERROR("failed to dup svc handle\n"); + goto err_out; + } + + if (lgss_copy_reverse_context(svc_gctx->gsc_mechctx, &mechctx) != + GSS_S_COMPLETE) { + CERROR("failed to copy mech context\n"); + goto err_svc_handle; + } + + if (rawobj_dup(&cli_gctx->gc_handle, &svc_gctx->gsc_rvs_hdl)) { + CERROR("failed to dup reverse handle\n"); + goto err_ctx; + } + + cli_gctx->gc_mechctx = mechctx; + gss_cli_ctx_uptodate(cli_gctx); + + return 0; + +err_ctx: + lgss_delete_sec_context(&mechctx); +err_svc_handle: + rawobj_free(&cli_gctx->gc_svc_handle); +err_out: + return -ENOMEM; +} + +static void gss_init_at_reply_offset(void) +{ + __u32 buflens[3]; + int clearsize; + + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = lustre_msg_early_size(); + buflens[2] = gss_cli_payload(NULL, buflens[1], 0); + gss_at_reply_off_integ = lustre_msg_size_v2(3, buflens); + + buflens[0] = lustre_msg_early_size(); + clearsize = lustre_msg_size_v2(1, buflens); + buflens[0] = PTLRPC_GSS_HEADER_SIZE; + buflens[1] = gss_cli_payload(NULL, clearsize, 0); + buflens[2] = gss_cli_payload(NULL, clearsize, 1); + gss_at_reply_off_priv = lustre_msg_size_v2(3, buflens); +} + +int __init sptlrpc_gss_init(void) +{ + int rc; + + rc = gss_init_lproc(); + if (rc) + return rc; + + rc = gss_init_cli_upcall(); + if (rc) + goto out_lproc; + + rc = gss_init_svc_upcall(); + if (rc) + goto out_cli_upcall; + + rc = init_kerberos_module(); + if (rc) + goto out_svc_upcall; + + /* register policy after all other stuff be intialized, because it + * might be in used immediately after the registration. */ + + rc = gss_init_keyring(); + if (rc) + goto out_kerberos; + +#ifdef HAVE_GSS_PIPEFS + rc = gss_init_pipefs(); + if (rc) + goto out_keyring; +#endif + + gss_init_at_reply_offset(); + + return 0; + +#ifdef HAVE_GSS_PIPEFS +out_keyring: + gss_exit_keyring(); +#endif + +out_kerberos: + cleanup_kerberos_module(); +out_svc_upcall: + gss_exit_svc_upcall(); +out_cli_upcall: + gss_exit_cli_upcall(); +out_lproc: + gss_exit_lproc(); + return rc; +} + +static void __exit sptlrpc_gss_exit(void) +{ + gss_exit_keyring(); +#ifdef HAVE_GSS_PIPEFS + gss_exit_pipefs(); +#endif + cleanup_kerberos_module(); + gss_exit_svc_upcall(); + gss_exit_cli_upcall(); + gss_exit_lproc(); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("GSS security policy for Lustre"); +MODULE_LICENSE("GPL"); + +module_init(sptlrpc_gss_init); +module_exit(sptlrpc_gss_exit); diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c new file mode 100644 index 000000000000..47a3c0512739 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/import.c @@ -0,0 +1,1613 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/import.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +struct ptlrpc_connect_async_args { + __u64 pcaa_peer_committed; + int pcaa_initial_connect; +}; + +/** + * Updates import \a imp current state to provided \a state value + * Helper function. Must be called under imp_lock. + */ +static void __import_set_state(struct obd_import *imp, + enum lustre_imp_state state) +{ + imp->imp_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; + imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = + cfs_time_current_sec(); + imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % + IMP_STATE_HIST_LEN; +} + +/* A CLOSED import should remain so. */ +#define IMPORT_SET_STATE_NOLOCK(imp, state) \ +do { \ + if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ + CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ + imp, obd2cli_tgt(imp->imp_obd), \ + ptlrpc_import_state_name(imp->imp_state), \ + ptlrpc_import_state_name(state)); \ + __import_set_state(imp, state); \ + } \ +} while(0) + +#define IMPORT_SET_STATE(imp, state) \ +do { \ + spin_lock(&imp->imp_lock); \ + IMPORT_SET_STATE_NOLOCK(imp, state); \ + spin_unlock(&imp->imp_lock); \ +} while(0) + + +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void * data, int rc); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); + +/* Only this function is allowed to change the import state when it is + * CLOSED. I would rather refcount the import and free it after + * disconnection like we do with exports. To do that, the client_obd + * will need to save the peer info somewhere other than in the import, + * though. */ +int ptlrpc_init_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + + imp->imp_generation++; + imp->imp_state = LUSTRE_IMP_NEW; + + spin_unlock(&imp->imp_lock); + + return 0; +} +EXPORT_SYMBOL(ptlrpc_init_import); + +#define UUID_STR "_UUID" +void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) +{ + *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) + ? uuid : uuid + strlen(prefix); + + *uuid_len = strlen(*uuid_start); + + if (*uuid_len < strlen(UUID_STR)) + return; + + if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), + UUID_STR, strlen(UUID_STR))) + *uuid_len -= strlen(UUID_STR); +} +EXPORT_SYMBOL(deuuidify); + +/** + * Returns true if import was FULL, false if import was already not + * connected. + * @imp - import to be disconnected + * @conn_cnt - connection count (epoch) of the request that timed out + * and caused the disconnection. In some cases, multiple + * inflight requests can fail to a single target (e.g. OST + * bulk requests) and if one has already caused a reconnection + * (increasing the import->conn_cnt) the older failure should + * not also cause a reconnection. If zero it forces a reconnect. + */ +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) +{ + int rc = 0; + + spin_lock(&imp->imp_lock); + + if (imp->imp_state == LUSTRE_IMP_FULL && + (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { + char *target_start; + int target_len; + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + if (imp->imp_replayable) { + LCONSOLE_WARN("%s: Connection to %.*s (at %s) was " + "lost; in progress operations using this " + "service will wait for recovery to complete\n", + imp->imp_obd->obd_name, target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } else { + LCONSOLE_ERROR_MSG(0x166, "%s: Connection to " + "%.*s (at %s) was lost; in progress " + "operations using this service will fail\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + ptlrpc_deactivate_timeouts(imp); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + spin_unlock(&imp->imp_lock); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); + rc = 1; + } else { + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", + imp->imp_client->cli_name, imp, + (imp->imp_state == LUSTRE_IMP_FULL && + imp->imp_conn_cnt > conn_cnt) ? + "reconnected" : "not connected", imp->imp_conn_cnt, + conn_cnt, ptlrpc_import_state_name(imp->imp_state)); + } + + return rc; +} + +/* Must be called with imp_lock held! */ +static void ptlrpc_deactivate_and_unlock_import(struct obd_import *imp) +{ + ENTRY; + LASSERT(spin_is_locked(&imp->imp_lock)); + + CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); + imp->imp_invalid = 1; + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + + ptlrpc_abort_inflight(imp); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + + EXIT; +} + +/* + * This acts as a barrier; all existing requests are rejected, and + * no new requests will be accepted until the import is valid again. + */ +void ptlrpc_deactivate_import(struct obd_import *imp) +{ + spin_lock(&imp->imp_lock); + ptlrpc_deactivate_and_unlock_import(imp); +} +EXPORT_SYMBOL(ptlrpc_deactivate_import); + +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now) +{ + long dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time_t now = cfs_time_current_sec(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + +/** + * This function will invalidate the import, if necessary, then block + * for all the RPC completions, and finally notify the obd to + * invalidate its state (ie cancel locks, clear pending requests, + * etc). + */ +void ptlrpc_invalidate_import(struct obd_import *imp) +{ + struct list_head *tmp, *n; + struct ptlrpc_request *req; + struct l_wait_info lwi; + unsigned int timeout; + int rc; + + atomic_inc(&imp->imp_inval_count); + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) + ptlrpc_deactivate_import(imp); + + LASSERT(imp->imp_invalid); + + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some rdma transfer is not in progress right now. */ + do { + /* Calculate max timeout for waiting on rpcs to error + * out. Use obd_timeout if calculated value is smaller + * than it. */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + } else { + /* decrease the interval to increase race condition */ + timeout = 1; + } + + CDEBUG(D_RPCTRACE,"Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2, + NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); + + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, + atomic_read(&imp->imp_inflight)); + + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inflight) == 0) { + int count = atomic_read(&imp->imp_unregistering); + + /* We know that "unregistering" rpcs only can + * survive in sending or delaying lists (they + * maybe waiting for long reply unlink in + * sluggish nets). Let's check this. If there + * is no inflight and unregistering != 0, this + * is bug. */ + LASSERTF(count == 0, "Some RPCs are still " + "unregistering: %d\n", count); + + /* Let's save one loop as soon as inflight have + * dropped to zero. No new inflights possible at + * this point. */ + rc = 0; + } else { + list_for_each_safe(tmp, n, + &imp->imp_sending_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on sending list"); + } + list_for_each_safe(tmp, n, + &imp->imp_delayed_list) { + req = list_entry(tmp, + struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, + "still on delayed list"); + } + + CERROR("%s: RPCs in \"%s\" phase found (%d). " + "Network is sluggish? Waiting them " + "to error out.\n", cli_tgt, + ptlrpc_phase2str(RQ_PHASE_UNREGISTERING), + atomic_read(&imp-> + imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs added to import in + * "invalidate" state. + */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); + sptlrpc_import_flush_all_ctx(imp); + + atomic_dec(&imp->imp_inval_count); + wake_up_all(&imp->imp_recovery_waitq); +} +EXPORT_SYMBOL(ptlrpc_invalidate_import); + +/* unset imp_invalid */ +void ptlrpc_activate_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + + spin_lock(&imp->imp_lock); + imp->imp_invalid = 0; + ptlrpc_activate_timeouts(imp); + spin_unlock(&imp->imp_lock); + obd_import_event(obd, imp, IMP_EVENT_ACTIVE); +} +EXPORT_SYMBOL(ptlrpc_activate_import); + +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) +{ + ENTRY; + + LASSERT(!imp->imp_dlm_fake); + + if (ptlrpc_set_import_discon(imp, conn_cnt)) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + + CDEBUG(D_HA, "%s: waking up pinger\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_force_verify = 1; + spin_unlock(&imp->imp_lock); + + ptlrpc_pinger_wake_up(); + } + EXIT; +} +EXPORT_SYMBOL(ptlrpc_fail_import); + +int ptlrpc_reconnect_import(struct obd_import *imp) +{ + ptlrpc_set_import_discon(imp, 0); + /* Force a new connect attempt */ + ptlrpc_invalidate_import(imp); + /* Do a fresh connect next time by zeroing the handle */ + ptlrpc_disconnect_import(imp, 1); + /* Wait for all invalidate calls to finish */ + if (atomic_read(&imp->imp_inval_count) > 0) { + int rc; + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inval_count) == 0), + &lwi); + if (rc) + CERROR("Interrupted, inval=%d\n", + atomic_read(&imp->imp_inval_count)); + } + + /* Allow reconnect attempts */ + imp->imp_obd->obd_no_recov = 0; + /* Remove 'invalid' flag */ + ptlrpc_activate_import(imp); + /* Attempt a new connect */ + ptlrpc_recover_import(imp, NULL, 0); + return 0; +} +EXPORT_SYMBOL(ptlrpc_reconnect_import); + +/** + * Connection on import \a imp is changed to another one (if more than one is + * present). We typically chose connection that we have not tried to connect to + * the longest + */ +static int import_select_connection(struct obd_import *imp) +{ + struct obd_import_conn *imp_conn = NULL, *conn; + struct obd_export *dlmexp; + char *target_start; + int target_len, tried_all = 1; + ENTRY; + + spin_lock(&imp->imp_lock); + + if (list_empty(&imp->imp_conn_list)) { + CERROR("%s: no connections available\n", + imp->imp_obd->obd_name); + spin_unlock(&imp->imp_lock); + RETURN(-EINVAL); + } + + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n", + imp->imp_obd->obd_name, + libcfs_nid2str(conn->oic_conn->c_peer.nid), + conn->oic_last_attempt); + + /* If we have not tried this connection since + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { + imp_conn = conn; + tried_all = 0; + break; + } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) + imp_conn = conn; + } + + /* if not found, simply choose the current one */ + if (!imp_conn || imp->imp_force_reconnect) { + LASSERT(imp->imp_conn_current); + imp_conn = imp->imp_conn_current; + tried_all = 0; + } + LASSERT(imp_conn->oic_conn); + + /* If we've tried everything, and we're back to the beginning of the + list, increase our timeout and try again. It will be reset when + we do finally connect. (FIXME: really we should wait for all network + state associated with the last connection attempt to drain before + trying to reconnect on it.) */ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; + if (at_get(at) < CONNECTION_SWITCH_MAX) { + at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); + if (at_get(at) > CONNECTION_SWITCH_MAX) + at_reset(at, CONNECTION_SWITCH_MAX); + } + LASSERT(imp_conn->oic_last_attempt); + CDEBUG(D_HA, "%s: tried all connections, increasing latency " + "to %ds\n", imp->imp_obd->obd_name, at_get(at)); + } + + imp_conn->oic_last_attempt = cfs_time_current_64(); + + /* switch connection, don't mind if it's same as the current one */ + if (imp->imp_connection) + ptlrpc_connection_put(imp->imp_connection); + imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + LASSERT(dlmexp != NULL); + if (dlmexp->exp_connection) + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + class_export_put(dlmexp); + + if (imp->imp_conn_current != imp_conn) { + if (imp->imp_conn_current) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + + CDEBUG(D_HA, "%s: Connection changing to" + " %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + } + + imp->imp_conn_current = imp_conn; + } + + CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", + imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, + libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); + + spin_unlock(&imp->imp_lock); + + RETURN(0); +} + +/* + * must be called under imp_lock + */ +static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) +{ + struct ptlrpc_request *req; + struct list_head *tmp; + + if (list_empty(&imp->imp_replay_list)) + return 0; + tmp = imp->imp_replay_list.next; + req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); + *transno = req->rq_transno; + if (req->rq_transno == 0) { + DEBUG_REQ(D_ERROR, req, "zero transno in replay"); + LBUG(); + } + + return 1; +} + +/** + * Attempt to (re)connect import \a imp. This includes all preparations, + * initializing CONNECT RPC request and passing it to ptlrpcd for + * actual sending. + * Returns 0 on success or error code. + */ +int ptlrpc_connect_import(struct obd_import *imp) +{ + struct obd_device *obd = imp->imp_obd; + int initial_connect = 0; + int set_transno = 0; + __u64 committed_before_reconnect = 0; + struct ptlrpc_request *request; + char *bufs[] = { NULL, + obd2cli_tgt(imp->imp_obd), + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle, + (char *)&imp->imp_connect_data }; + struct ptlrpc_connect_async_args *aa; + int rc; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + spin_unlock(&imp->imp_lock); + CERROR("can't connect to a closed import\n"); + RETURN(-EINVAL); + } else if (imp->imp_state == LUSTRE_IMP_FULL) { + spin_unlock(&imp->imp_lock); + CERROR("already connected\n"); + RETURN(0); + } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) { + spin_unlock(&imp->imp_lock); + CERROR("already connecting\n"); + RETURN(-EALREADY); + } + + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); + + imp->imp_conn_cnt++; + imp->imp_resend_replay = 0; + + if (!lustre_handle_is_used(&imp->imp_remote_handle)) + initial_connect = 1; + else + committed_before_reconnect = imp->imp_peer_committed_transno; + + set_transno = ptlrpc_first_transno(imp, + &imp->imp_connect_data.ocd_transno); + spin_unlock(&imp->imp_lock); + + rc = import_select_connection(imp); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_import_sec_adapt(imp, NULL, 0); + if (rc) + GOTO(out, rc); + + /* Reset connect flags to the originally requested flags, in case + * the server is updated on-the-fly we will get the new features. */ + imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; + /* Reset ocd_version each time so the server knows the exact versions */ + imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, + &obd->obd_uuid, &imp->imp_connect_data, NULL); + if (rc) + GOTO(out, rc); + + request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); + if (request == NULL) + GOTO(out, rc = -ENOMEM); + + rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, + imp->imp_connect_op, bufs, NULL); + if (rc) { + ptlrpc_request_free(request); + GOTO(out, rc); + } + + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. + * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + + lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER); + + request->rq_no_resend = request->rq_no_delay = 1; + request->rq_send_state = LUSTRE_IMP_CONNECTING; + /* Allow a slightly larger reply for future growth compatibility */ + req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, + sizeof(struct obd_connect_data)+16*sizeof(__u64)); + ptlrpc_request_set_replen(request); + request->rq_interpret_reply = ptlrpc_connect_interpret; + + CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args)); + aa = ptlrpc_req_async_args(request); + memset(aa, 0, sizeof *aa); + + aa->pcaa_peer_committed = committed_before_reconnect; + aa->pcaa_initial_connect = initial_connect; + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + } + + if (set_transno) + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_TRANSNO); + + DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", + request->rq_timeout); + ptlrpcd_add_req(request, PDL_POLICY_ROUND, -1); + rc = 0; +out: + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_connect_import); + +static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) +{ + int force_verify; + + spin_lock(&imp->imp_lock); + force_verify = imp->imp_force_verify != 0; + spin_unlock(&imp->imp_lock); + + if (force_verify) + ptlrpc_pinger_wake_up(); +} + +static int ptlrpc_busy_reconnect(int rc) +{ + return (rc == -EBUSY) || (rc == -EAGAIN); +} + +/** + * interpret_reply callback for connect RPCs. + * Looks into returned status of connect operation and decides + * what to do with the import - i.e enter recovery, promote it to + * full state for normal operations of disconnect it due to an error. + */ +static int ptlrpc_connect_interpret(const struct lu_env *env, + struct ptlrpc_request *request, + void *data, int rc) +{ + struct ptlrpc_connect_async_args *aa = data; + struct obd_import *imp = request->rq_import; + struct client_obd *cli = &imp->imp_obd->u.cli; + struct lustre_handle old_hdl; + __u64 old_connect_flags; + int msg_flags; + struct obd_connect_data *ocd; + struct obd_export *exp; + int ret; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + imp->imp_connect_tried = 1; + spin_unlock(&imp->imp_lock); + RETURN(0); + } + + if (rc) { + /* if this reconnect to busy export - not need select new target + * for connecting*/ + imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); + spin_unlock(&imp->imp_lock); + ptlrpc_maybe_ping_import_soon(imp); + GOTO(out, rc); + } + spin_unlock(&imp->imp_lock); + + LASSERT(imp->imp_conn_current); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + + ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, + RCL_SERVER); + /* server replied obd_connect_data is always bigger */ + ocd = req_capsule_server_sized_get(&request->rq_pill, + &RMF_CONNECT_DATA, ret); + + if (ocd == NULL) { + CERROR("%s: no connect data from server\n", + imp->imp_obd->obd_name); + rc = -EPROTO; + GOTO(out, rc); + } + + spin_lock(&imp->imp_lock); + + /* All imports are pingable */ + imp->imp_pingable = 1; + imp->imp_force_reconnect = 0; + imp->imp_force_verify = 0; + + imp->imp_connect_data = *ocd; + + CDEBUG(D_HA, "%s: connect to target with instance %u\n", + imp->imp_obd->obd_name, ocd->ocd_instance); + exp = class_conn2export(&imp->imp_dlm_handle); + + spin_unlock(&imp->imp_lock); + + /* check that server granted subset of flags we asked for. */ + if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != + ocd->ocd_connect_flags) { + CERROR("%s: Server didn't granted asked subset of flags: " + "asked="LPX64" grranted="LPX64"\n", + imp->imp_obd->obd_name,imp->imp_connect_flags_orig, + ocd->ocd_connect_flags); + GOTO(out, rc = -EPROTO); + } + + if (!exp) { + /* This could happen if export is cleaned during the + connect attempt */ + CERROR("%s: missing export after connect\n", + imp->imp_obd->obd_name); + GOTO(out, rc = -ENODEV); + } + old_connect_flags = exp_connect_flags(exp); + exp->exp_connect_data = *ocd; + imp->imp_obd->obd_self_export->exp_connect_data = *ocd; + class_export_put(exp); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); + + if (aa->pcaa_initial_connect) { + spin_lock(&imp->imp_lock); + if (msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + spin_unlock(&imp->imp_lock); + CDEBUG(D_HA, "connected to replayable target: %s\n", + obd2cli_tgt(imp->imp_obd)); + } else { + imp->imp_replayable = 0; + spin_unlock(&imp->imp_lock); + } + + /* if applies, adjust the imp->imp_msg_magic here + * according to reply flags */ + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + /* Initial connects are allowed for clients with non-random + * uuids when servers are in recovery. Simply signal the + * servers replay is complete and wait in REPLAY_WAIT. */ + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "connect to %s during recovery\n", + obd2cli_tgt(imp->imp_obd)); + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + } + + GOTO(finish, rc = 0); + } + + /* Determine what recovery state to move the import to. */ + if (MSG_CONNECT_RECONNECT & msg_flags) { + memset(&old_hdl, 0, sizeof(old_hdl)); + if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), + sizeof (old_hdl))) { + LCONSOLE_WARN("Reconnect to %s (at @%s) failed due " + "bad handle "LPX64"\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_dlm_handle.cookie); + GOTO(out, rc = -ENOTCONN); + } + + if (memcmp(&imp->imp_remote_handle, + lustre_msg_get_handle(request->rq_repmsg), + sizeof(imp->imp_remote_handle))) { + int level = msg_flags & MSG_CONNECT_RECOVERING ? + D_HA : D_WARNING; + + /* Bug 16611/14775: if server handle have changed, + * that means some sort of disconnection happened. + * If the server is not in recovery, that also means it + * already erased all of our state because of previous + * eviction. If it is in recovery - we are safe to + * participate since we can reestablish all of our state + * with server again */ + if ((MSG_CONNECT_RECOVERING & msg_flags)) { + CDEBUG(level,"%s@%s changed server handle from " + LPX64" to "LPX64 + " but is still in recovery\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } else { + LCONSOLE_WARN("Evicted from %s (at %s) " + "after server handle changed from " + LPX64" to "LPX64"\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection-> \ + c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, + lustre_msg_get_handle( + request->rq_repmsg)->cookie); + } + + + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + + if (!(MSG_CONNECT_RECOVERING & msg_flags)) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + GOTO(finish, rc = 0); + } + + } else { + CDEBUG(D_HA, "reconnected to %s@%s after partition\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + } + + if (imp->imp_invalid) { + CDEBUG(D_HA, "%s: reconnected but import is invalid; " + "marking evicted\n", imp->imp_obd->obd_name); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } else if (MSG_CONNECT_RECOVERING & msg_flags) { + CDEBUG(D_HA, "%s: reconnected to %s during replay\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 1; + spin_unlock(&imp->imp_lock); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + LASSERT(imp->imp_replayable); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + imp->imp_last_replay_transno = 0; + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); + } else { + DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags" + " not set: %x)", imp->imp_obd->obd_name, msg_flags); + imp->imp_remote_handle = + *lustre_msg_get_handle(request->rq_repmsg); + IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); + } + + /* Sanity checks for a reconnected import. */ + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + CERROR("imp_replayable flag does not match server " + "after reconnect. We should LBUG right here.\n"); + } + + if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && + lustre_msg_get_last_committed(request->rq_repmsg) < + aa->pcaa_peer_committed) { + CERROR("%s went back in time (transno "LPD64 + " was previously committed, server now claims "LPD64 + ")! See https://bugzilla.lustre.org/show_bug.cgi?" + "id=9646\n", + obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, + lustre_msg_get_last_committed(request->rq_repmsg)); + } + +finish: + rc = ptlrpc_import_recovery_state_machine(imp); + if (rc != 0) { + if (rc == -ENOTCONN) { + CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;" + "invalidating and reconnecting\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_connect_import(imp); + imp->imp_connect_tried = 1; + RETURN(0); + } + } else { + + spin_lock(&imp->imp_lock); + list_del(&imp->imp_conn_current->oic_item); + list_add(&imp->imp_conn_current->oic_item, + &imp->imp_conn_list); + imp->imp_last_success_conn = + imp->imp_conn_current->oic_last_attempt; + + spin_unlock(&imp->imp_lock); + + if (!ocd->ocd_ibits_known && + ocd->ocd_connect_flags & OBD_CONNECT_IBITS) + CERROR("Inodebits aware server returned zero compatible" + " bits?\n"); + + if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version > LUSTRE_VERSION_CODE + + LUSTRE_VERSION_OFFSET_WARN || + ocd->ocd_version < LUSTRE_VERSION_CODE - + LUSTRE_VERSION_OFFSET_WARN)) { + /* Sigh, some compilers do not like #ifdef in the middle + of macro arguments */ + const char *older = "older. Consider upgrading server " + "or downgrading client"; + const char *newer = "newer than client version. " + "Consider upgrading client"; + + LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) " + "is much %s (%s)\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + ocd->ocd_version > LUSTRE_VERSION_CODE ? + newer : older, LUSTRE_VERSION_STRING); + } + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + /* Check if server has LU-1252 fix applied to not always swab + * the IR MNE entries. Do this only once per connection. This + * fixup is version-limited, because we don't want to carry the + * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we + * need interop with unpatched 2.2 servers. For newer servers, + * the client will do MNE swabbing only as needed. LU-1644 */ + if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && + OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && + OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && + strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) == 0)) + imp->imp_need_mne_swab = 1; + else /* clear if server was upgraded since last connect */ + imp->imp_need_mne_swab = 0; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { + /* We sent to the server ocd_cksum_types with bits set + * for algorithms we understand. The server masked off + * the checksum types it doesn't support */ + if ((ocd->ocd_cksum_types & + cksum_types_supported_client()) == 0) { + LCONSOLE_WARN("The negotiation of the checksum " + "alogrithm to use with server %s " + "failed (%x/%x), disabling " + "checksums\n", + obd2cli_tgt(imp->imp_obd), + ocd->ocd_cksum_types, + cksum_types_supported_client()); + cli->cl_checksum = 0; + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } else { + cli->cl_supp_cksum_types = ocd->ocd_cksum_types; + } + } else { + /* The server does not support OBD_CONNECT_CKSUM. + * Enforce ADLER for backward compatibility*/ + cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; + } + cli->cl_cksum_type =cksum_type_select(cli->cl_supp_cksum_types); + + if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + cli->cl_max_pages_per_rpc = + min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT, + cli->cl_max_pages_per_rpc); + else if (imp->imp_connect_op == MDS_CONNECT || + imp->imp_connect_op == MGS_CONNECT) + cli->cl_max_pages_per_rpc = 1; + + /* Reset ns_connect_flags only for initial connect. It might be + * changed in while using FS and if we reset it in reconnect + * this leads to losing user settings done before such as + * disable lru_resize, etc. */ + if (old_connect_flags != exp_connect_flags(exp) || + aa->pcaa_initial_connect) { + CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server " + "flags: "LPX64"\n", imp->imp_obd->obd_name, + ocd->ocd_connect_flags); + imp->imp_obd->obd_namespace->ns_connect_flags = + ocd->ocd_connect_flags; + imp->imp_obd->obd_namespace->ns_orig_connect_flags = + ocd->ocd_connect_flags; + } + + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* We need a per-message support flag, because + a. we don't know if the incoming connect reply + supports AT or not (in reply_in_callback) + until we unpack it. + b. failovered server means export and flags are gone + (in ptlrpc_send_reply). + Can only be set when we know AT is supported at + both ends */ + imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; + else + imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; + + if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; + else + imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; + + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && + (cli->cl_max_pages_per_rpc > 0)); + } + +out: + imp->imp_connect_tried = 1; + + if (rc != 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); + if (rc == -EACCES) { + /* + * Give up trying to reconnect + * EACCES means client has no permission for connection + */ + imp->imp_obd->obd_no_recov = 1; + ptlrpc_deactivate_import(imp); + } + + if (rc == -EPROTO) { + struct obd_connect_data *ocd; + + /* reply message might not be ready */ + if (request->rq_repmsg == NULL) + RETURN(-EPROTO); + + ocd = req_capsule_server_get(&request->rq_pill, + &RMF_CONNECT_DATA); + if (ocd && + (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && + (ocd->ocd_version != LUSTRE_VERSION_CODE)) { + /* Actually servers are only supposed to refuse + connection from liblustre clients, so we should + never see this from VFS context */ + LCONSOLE_ERROR_MSG(0x16a, "Server %s version " + "(%d.%d.%d.%d)" + " refused connection from this client " + "with an incompatible version (%s). " + "Client must be recompiled\n", + obd2cli_tgt(imp->imp_obd), + OBD_OCD_VERSION_MAJOR(ocd->ocd_version), + OBD_OCD_VERSION_MINOR(ocd->ocd_version), + OBD_OCD_VERSION_PATCH(ocd->ocd_version), + OBD_OCD_VERSION_FIX(ocd->ocd_version), + LUSTRE_VERSION_STRING); + ptlrpc_deactivate_import(imp); + IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); + } + RETURN(-EPROTO); + } + + ptlrpc_maybe_ping_import_soon(imp); + + CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", + obd2cli_tgt(imp->imp_obd), + (char *)imp->imp_connection->c_remote_uuid.uuid, rc); + } + + wake_up_all(&imp->imp_recovery_waitq); + RETURN(rc); +} + +/** + * interpret callback for "completed replay" RPCs. + * \see signal_completed_replay + */ +static int completed_replay_interpret(const struct lu_env *env, + struct ptlrpc_request *req, + void * data, int rc) +{ + ENTRY; + atomic_dec(&req->rq_import->imp_replay_inflight); + if (req->rq_status == 0 && + !req->rq_import->imp_vbr_failed) { + ptlrpc_import_recovery_state_machine(req->rq_import); + } else { + if (req->rq_import->imp_vbr_failed) { + CDEBUG(D_WARNING, + "%s: version recovery fails, reconnecting\n", + req->rq_import->imp_obd->obd_name); + } else { + CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, " + "reconnecting\n", + req->rq_import->imp_obd->obd_name, + req->rq_status); + } + ptlrpc_connect_import(req->rq_import); + } + + RETURN(0); +} + +/** + * Let server know that we have no requests to replay anymore. + * Achieved by just sending a PING request + */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) + RETURN(0); + + LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); + atomic_inc(&imp->imp_replay_inflight); + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, + OBD_PING); + if (req == NULL) { + atomic_dec(&imp->imp_replay_inflight); + RETURN(-ENOMEM); + } + + ptlrpc_request_set_replen(req); + req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; + lustre_msg_add_flags(req->rq_reqmsg, + MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); + if (AT_OFF) + req->rq_timeout *= 3; + req->rq_interpret_reply = completed_replay_interpret; + + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + RETURN(0); +} + +/** + * In kernel code all import invalidation happens in its own + * separate thread, so that whatever application happened to encounter + * a problem could still be killed or otherwise continue + */ +static int ptlrpc_invalidate_import_thread(void *data) +{ + struct obd_import *imp = data; + + ENTRY; + + unshare_fs_struct(); + + CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + ptlrpc_invalidate_import(imp); + + if (obd_dump_on_eviction) { + CERROR("dump the log upon eviction\n"); + libcfs_debug_dumplog(); + } + + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + ptlrpc_import_recovery_state_machine(imp); + + class_import_put(imp); + RETURN(0); +} + +/** + * This is the state machine for client-side recovery on import. + * + * Typicaly we have two possibly paths. If we came to server and it is not + * in recovery, we just enter IMP_EVICTED state, invalidate our import + * state and reconnect from scratch. + * If we came to server that is in recovery, we enter IMP_REPLAY import state. + * We go through our list of requests to replay and send them to server one by + * one. + * After sending all request from the list we change import state to + * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server + * and also all the locks we don't yet have and wait for server to grant us. + * After that we send a special "replay completed" request and change import + * state to IMP_REPLAY_WAIT. + * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER + * state and resend all requests from sending list. + * After that we promote import to FULL state and send all delayed requests + * and import is fully operational after that. + * + */ +int ptlrpc_import_recovery_state_machine(struct obd_import *imp) +{ + int rc = 0; + int inflight; + char *target_start; + int target_len; + + ENTRY; + if (imp->imp_state == LUSTRE_IMP_EVICTED) { + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + /* Don't care about MGC eviction */ + if (strcmp(imp->imp_obd->obd_type->typ_name, + LUSTRE_MGC_NAME) != 0) { + LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted " + "by %.*s; in progress operations " + "using this service will fail.\n", + imp->imp_obd->obd_name, target_len, + target_start); + } + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + /* reset vbr_failed flag upon eviction */ + spin_lock(&imp->imp_lock); + imp->imp_vbr_failed = 0; + spin_unlock(&imp->imp_lock); + + { + task_t *task; + /* bug 17802: XXX client_disconnect_export vs connect request + * race. if client will evicted at this time, we start + * invalidate thread without reference to import and import can + * be freed at same time. */ + class_import_get(imp); + task = kthread_run(ptlrpc_invalidate_import_thread, imp, + "ll_imp_inval"); + if (IS_ERR(task)) { + class_import_put(imp); + CERROR("error starting invalidate thread: %d\n", rc); + rc = PTR_ERR(task); + } else { + rc = 0; + } + RETURN(rc); + } + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY) { + CDEBUG(D_HA, "replay requested by %s\n", + obd2cli_tgt(imp->imp_obd)); + rc = ptlrpc_replay_next(imp, &inflight); + if (inflight == 0 && + atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); + rc = ldlm_replay_locks(imp); + if (rc) + GOTO(out, rc); + } + rc = 0; + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); + rc = signal_completed_replay(imp); + if (rc) + GOTO(out, rc); + } + + } + + if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) { + if (atomic_read(&imp->imp_replay_inflight) == 0) { + IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); + } + } + + if (imp->imp_state == LUSTRE_IMP_RECOVER) { + CDEBUG(D_HA, "reconnected to %s@%s\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + rc = ptlrpc_resend(imp); + if (rc) + GOTO(out, rc); + IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); + ptlrpc_activate_import(imp); + + deuuidify(obd2cli_tgt(imp->imp_obd), NULL, + &target_start, &target_len); + LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); + } + + if (imp->imp_state == LUSTRE_IMP_FULL) { + wake_up_all(&imp->imp_recovery_waitq); + ptlrpc_wake_delayed(imp); + } + +out: + RETURN(rc); +} + +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) +{ + struct ptlrpc_request *req; + int rq_opc, rc = 0; + int nowait = imp->imp_obd->obd_force; + ENTRY; + + if (nowait) + GOTO(set_state, rc); + + switch (imp->imp_connect_op) { + case OST_CONNECT: rq_opc = OST_DISCONNECT; break; + case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break; + case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break; + default: + CERROR("don't know how to disconnect from %s (connect_op %d)\n", + obd2cli_tgt(imp->imp_obd), imp->imp_connect_op); + RETURN(-EINVAL); + } + + if (ptlrpc_import_in_recovery(imp)) { + struct l_wait_info lwi; + cfs_duration_t timeout; + + + if (AT_OFF) { + if (imp->imp_server_timeout) + timeout = cfs_time_seconds(obd_timeout / 2); + else + timeout = cfs_time_seconds(obd_timeout); + } else { + int idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal); + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); + } + + lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), + back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + + } + + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_FULL) + GOTO(out, 0); + + spin_unlock(&imp->imp_lock); + + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, + LUSTRE_OBD_VERSION, rq_opc); + if (req) { + /* We are disconnecting, do not retry a failed DISCONNECT rpc if + * it fails. We can get through the above with a down server + * if the client doesn't know the server is gone yet. */ + req->rq_no_resend = 1; + + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min_t(int, req->rq_timeout, + INITIAL_CONNECT_TIMEOUT); + + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); + req->rq_send_state = LUSTRE_IMP_CONNECTING; + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + +set_state: + spin_lock(&imp->imp_lock); +out: + if (noclose) + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); + else + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + spin_unlock(&imp->imp_lock); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_disconnect_import); + +void ptlrpc_cleanup_imp(struct obd_import *imp) +{ + ENTRY; + + spin_lock(&imp->imp_lock); + IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); + imp->imp_generation++; + spin_unlock(&imp->imp_lock); + ptlrpc_abort_inflight(imp); + + EXIT; +} +EXPORT_SYMBOL(ptlrpc_cleanup_imp); + +/* Adaptive Timeout utils */ +extern unsigned int at_min, at_max, at_history; + +/* Bin into timeslices using AT_BINS bins. + This gives us a max of the last binlimit*AT_BINS secs without the storage, + but still smoothing out a return to normalcy from a slow response. + (E.g. remember the maximum latency in each minute of the last 4 minutes.) */ +int at_measured(struct adaptive_timeout *at, unsigned int val) +{ + unsigned int old = at->at_current; + time_t now = cfs_time_current_sec(); + time_t binlimit = max_t(time_t, at_history / AT_BINS, 1); + + LASSERT(at); + CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); + + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return 0; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + /* move bins over */ + shift = (now - at->at_binstart) / binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * binlimit; + } + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + + if (at_max > 0) + at->at_current = min(at->at_current, at_max); + at->at_current = max(at->at_current, at_min); + + if (at->at_current != old) + CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d " + "(val=%u) hist %u %u %u %u\n", at, + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); + + /* if we changed, report the old value */ + old = (at->at_current != old) ? old : 0; + + spin_unlock(&at->at_lock); + return old; +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) +{ + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not enough portals? */ + LASSERT(i < IMP_AT_MAX_PORTALS); + + at->iat_portal[i] = portal; +out: + spin_unlock(&imp->imp_lock); + return i; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c new file mode 100644 index 000000000000..2f55ce26ccba --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/layout.c @@ -0,0 +1,2396 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/layout.c + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov + */ +/* + * This file contains the "capsule/pill" abstraction layered above PTLRPC. + * + * Every struct ptlrpc_request contains a "pill", which points to a description + * of the format that the request conforms to. + */ + +#if !defined(__REQ_LAYOUT_USER__) + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +/* LUSTRE_VERSION_CODE */ +#include + +#include +/* lustre_swab_mdt_body */ +#include +/* obd2cli_tgt() (required by DEBUG_REQ()) */ +#include + +/* __REQ_LAYOUT_USER__ */ +#endif +/* struct ptlrpc_request, lustre_msg* */ +#include +#include +#include +#include + +/* + * RQFs (see below) refer to two struct req_msg_field arrays describing the + * client request and server reply, respectively. + */ +/* empty set of fields... for suitable definition of emptiness. */ +static const struct req_msg_field *empty[] = { + &RMF_PTLRPC_BODY +}; + +static const struct req_msg_field *mgs_target_info_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_TARGET_INFO +}; + +static const struct req_msg_field *mgs_set_info[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_SEND_PARAM +}; + +static const struct req_msg_field *mgs_config_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_BODY +}; + +static const struct req_msg_field *mgs_config_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MGS_CONFIG_RES +}; + +static const struct req_msg_field *log_cancel_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LOGCOOKIES +}; + +static const struct req_msg_field *mdt_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY +}; + +static const struct req_msg_field *mdt_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *quotactl_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_QUOTACTL +}; + +static const struct req_msg_field *quota_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *ldlm_intent_quota_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB, + &RMF_QUOTA_BODY +}; + +static const struct req_msg_field *mdt_close_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_EPOCH, + &RMF_REC_REINT, + &RMF_CAPA1 +}; + +static const struct req_msg_field *obd_statfs_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_STATFS +}; + +static const struct req_msg_field *seq_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_OPC, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *seq_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_SEQ_RANGE +}; + +static const struct req_msg_field *fld_query_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_OPC, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *fld_query_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FLD_MDFLD +}; + +static const struct req_msg_field *mds_getattr_name_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT +}; + +static const struct req_msg_field *mds_reint_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_reint_create_slave_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_rmt_acl_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_create_sym_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_reint_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_link_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_rename_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_SYMTGT, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_last_unlink_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_reint_setattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_MDT_EPOCH, + &RMF_EADATA, + &RMF_LOGCOOKIES, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *mds_reint_setxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_REC_REINT, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mdt_swap_layouts[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_SWAP_LAYOUTS, + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *obd_connect_client[] = { + &RMF_PTLRPC_BODY, + &RMF_TGTUUID, + &RMF_CLUUID, + &RMF_CONN, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_connect_server[] = { + &RMF_PTLRPC_BODY, + &RMF_CONNECT_DATA +}; + +static const struct req_msg_field *obd_set_info_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_SETINFO_VAL +}; + +static const struct req_msg_field *ost_grant_shrink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *mds_getinfo_client[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_KEY, + &RMF_GETINFO_VALLEN +}; + +static const struct req_msg_field *mds_getinfo_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GETINFO_VAL, +}; + +static const struct req_msg_field *ldlm_enqueue_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ +}; + +static const struct req_msg_field *ldlm_enqueue_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP +}; + +static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_cp_callback_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_DLM_GL_DESC +}; + +static const struct req_msg_field *ldlm_gl_callback_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_LVB +}; + +static const struct req_msg_field *ldlm_intent_basic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, +}; + +static const struct req_msg_field *ldlm_intent_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT +}; + +static const struct req_msg_field *ldlm_intent_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL +}; + +static const struct req_msg_field *ldlm_intent_layout_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_LAYOUT_INTENT, + &RMF_EADATA /* for new layout to be set up */ +}; +static const struct req_msg_field *ldlm_intent_open_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *ldlm_intent_getattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *ldlm_intent_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REP, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ldlm_intent_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_open_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ + &RMF_CAPA1, + &RMF_CAPA2, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *ldlm_intent_unlink_client[] = { + &RMF_PTLRPC_BODY, + &RMF_DLM_REQ, + &RMF_LDLM_INTENT, + &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ + &RMF_CAPA1, + &RMF_NAME +}; + +static const struct req_msg_field *mds_getxattr_client[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_NAME, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getxattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *mds_getattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_setattr_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDT_MD, + &RMF_ACL, + &RMF_CAPA1, + &RMF_CAPA2 +}; + +static const struct req_msg_field *mds_update_client[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE, +}; + +static const struct req_msg_field *mds_update_server[] = { + &RMF_PTLRPC_BODY, + &RMF_UPDATE_REPLY, +}; + +static const struct req_msg_field *llog_origin_handle_create_client[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_NAME +}; + +static const struct req_msg_field *llogd_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY +}; + +static const struct req_msg_field *llog_log_hdr_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOG_LOG_HDR +}; + +static const struct req_msg_field *llogd_conn_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_CONN_BODY +}; + +static const struct req_msg_field *llog_origin_handle_next_block_server[] = { + &RMF_PTLRPC_BODY, + &RMF_LLOGD_BODY, + &RMF_EADATA +}; + +static const struct req_msg_field *obd_idx_read_client[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *obd_idx_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_IDX_INFO +}; + +static const struct req_msg_field *ost_body_only[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_body_capa[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_destroy_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_DLM_REQ, + &RMF_CAPA1 +}; + + +static const struct req_msg_field *ost_brw_client[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_OBD_IOOBJ, + &RMF_NIOBUF_REMOTE, + &RMF_CAPA1 +}; + +static const struct req_msg_field *ost_brw_read_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY +}; + +static const struct req_msg_field *ost_brw_write_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OST_BODY, + &RMF_RCS +}; + +static const struct req_msg_field *ost_get_info_generic_server[] = { + &RMF_PTLRPC_BODY, + &RMF_GENERIC_DATA, +}; + +static const struct req_msg_field *ost_get_info_generic_client[] = { + &RMF_PTLRPC_BODY, + &RMF_SETINFO_KEY +}; + +static const struct req_msg_field *ost_get_last_id_server[] = { + &RMF_PTLRPC_BODY, + &RMF_OBD_ID +}; + +static const struct req_msg_field *ost_get_last_fid_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FID, +}; + +static const struct req_msg_field *ost_get_fiemap_client[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_KEY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *ost_get_fiemap_server[] = { + &RMF_PTLRPC_BODY, + &RMF_FIEMAP_VAL +}; + +static const struct req_msg_field *mdt_hsm_progress[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_PROGRESS, +}; + +static const struct req_msg_field *mdt_hsm_ct_register[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_ARCHIVE, +}; + +static const struct req_msg_field *mdt_hsm_ct_unregister[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, +}; + +static const struct req_msg_field *mdt_hsm_action_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_CURRENT_ACTION, +}; + +static const struct req_msg_field *mdt_hsm_state_get_server[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_HSM_USER_STATE, +}; + +static const struct req_msg_field *mdt_hsm_state_set[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_CAPA1, + &RMF_HSM_STATE_SET, +}; + +static const struct req_msg_field *mdt_hsm_request[] = { + &RMF_PTLRPC_BODY, + &RMF_MDT_BODY, + &RMF_MDS_HSM_REQUEST, + &RMF_MDS_HSM_USER_ITEM, + &RMF_GENERIC_DATA, +}; + +static struct req_format *req_formats[] = { + &RQF_OBD_PING, + &RQF_OBD_SET_INFO, + &RQF_OBD_IDX_READ, + &RQF_SEC_CTX, + &RQF_MGS_TARGET_REG, + &RQF_MGS_SET_INFO, + &RQF_MGS_CONFIG_READ, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, + &RQF_MDS_CONNECT, + &RQF_MDS_DISCONNECT, + &RQF_MDS_GET_INFO, + &RQF_MDS_GETSTATUS, + &RQF_MDS_STATFS, + &RQF_MDS_GETATTR, + &RQF_MDS_GETATTR_NAME, + &RQF_MDS_GETXATTR, + &RQF_MDS_SYNC, + &RQF_MDS_CLOSE, + &RQF_MDS_PIN, + &RQF_MDS_UNPIN, + &RQF_MDS_READPAGE, + &RQF_MDS_WRITEPAGE, + &RQF_MDS_IS_SUBDIR, + &RQF_MDS_DONE_WRITING, + &RQF_MDS_REINT, + &RQF_MDS_REINT_CREATE, + &RQF_MDS_REINT_CREATE_RMT_ACL, + &RQF_MDS_REINT_CREATE_SLAVE, + &RQF_MDS_REINT_CREATE_SYM, + &RQF_MDS_REINT_OPEN, + &RQF_MDS_REINT_UNLINK, + &RQF_MDS_REINT_LINK, + &RQF_MDS_REINT_RENAME, + &RQF_MDS_REINT_SETATTR, + &RQF_MDS_REINT_SETXATTR, + &RQF_MDS_QUOTACHECK, + &RQF_MDS_QUOTACTL, + &RQF_MDS_HSM_PROGRESS, + &RQF_MDS_HSM_CT_REGISTER, + &RQF_MDS_HSM_CT_UNREGISTER, + &RQF_MDS_HSM_STATE_GET, + &RQF_MDS_HSM_STATE_SET, + &RQF_MDS_HSM_ACTION, + &RQF_MDS_HSM_REQUEST, + &RQF_MDS_SWAP_LAYOUTS, + &RQF_UPDATE_OBJ, + &RQF_QC_CALLBACK, + &RQF_OST_CONNECT, + &RQF_OST_DISCONNECT, + &RQF_OST_QUOTACHECK, + &RQF_OST_QUOTACTL, + &RQF_OST_GETATTR, + &RQF_OST_SETATTR, + &RQF_OST_CREATE, + &RQF_OST_PUNCH, + &RQF_OST_SYNC, + &RQF_OST_DESTROY, + &RQF_OST_BRW_READ, + &RQF_OST_BRW_WRITE, + &RQF_OST_STATFS, + &RQF_OST_SET_GRANT_INFO, + &RQF_OST_GET_INFO_GENERIC, + &RQF_OST_GET_INFO_LAST_ID, + &RQF_OST_GET_INFO_LAST_FID, + &RQF_OST_SET_INFO_LAST_FID, + &RQF_OST_GET_INFO_FIEMAP, + &RQF_LDLM_ENQUEUE, + &RQF_LDLM_ENQUEUE_LVB, + &RQF_LDLM_CONVERT, + &RQF_LDLM_CANCEL, + &RQF_LDLM_CALLBACK, + &RQF_LDLM_CP_CALLBACK, + &RQF_LDLM_BL_CALLBACK, + &RQF_LDLM_GL_CALLBACK, + &RQF_LDLM_GL_DESC_CALLBACK, + &RQF_LDLM_INTENT, + &RQF_LDLM_INTENT_BASIC, + &RQF_LDLM_INTENT_LAYOUT, + &RQF_LDLM_INTENT_GETATTR, + &RQF_LDLM_INTENT_OPEN, + &RQF_LDLM_INTENT_CREATE, + &RQF_LDLM_INTENT_UNLINK, + &RQF_LDLM_INTENT_QUOTA, + &RQF_QUOTA_DQACQ, + &RQF_LOG_CANCEL, + &RQF_LLOG_ORIGIN_HANDLE_CREATE, + &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + &RQF_LLOG_ORIGIN_CONNECT +}; + +struct req_msg_field { + const __u32 rmf_flags; + const char *rmf_name; + /** + * Field length. (-1) means "variable length". If the + * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, + * but the actual size must be a whole multiple of \a rmf_size. + */ + const int rmf_size; + void (*rmf_swabber)(void *); + void (*rmf_dumper)(void *); + int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; +}; + +enum rmf_flags { + /** + * The field is a string, must be NUL-terminated. + */ + RMF_F_STRING = 1 << 0, + /** + * The field's buffer size need not match the declared \a rmf_size. + */ + RMF_F_NO_SIZE_CHECK = 1 << 1, + /** + * The field's buffer size must be a whole multiple of the declared \a + * rmf_size and the \a rmf_swabber function must work on the declared \a + * rmf_size worth of bytes. + */ + RMF_F_STRUCT_ARRAY = 1 << 2 +}; + +struct req_capsule; + +/* + * Request fields. + */ +#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ + .rmf_name = (name), \ + .rmf_flags = (flags), \ + .rmf_size = (size), \ + .rmf_swabber = (void (*)(void*))(swabber), \ + .rmf_dumper = (void (*)(void*))(dumper) \ +} + +struct req_msg_field RMF_GENERIC_DATA = + DEFINE_MSGF("generic_data", 0, + -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GENERIC_DATA); + +struct req_msg_field RMF_MGS_TARGET_INFO = + DEFINE_MSGF("mgs_target_info", 0, + sizeof(struct mgs_target_info), + lustre_swab_mgs_target_info, NULL); +EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); + +struct req_msg_field RMF_MGS_SEND_PARAM = + DEFINE_MSGF("mgs_send_param", 0, + sizeof(struct mgs_send_param), + NULL, NULL); +EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); + +struct req_msg_field RMF_MGS_CONFIG_BODY = + DEFINE_MSGF("mgs_config_read request", 0, + sizeof(struct mgs_config_body), + lustre_swab_mgs_config_body, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); + +struct req_msg_field RMF_MGS_CONFIG_RES = + DEFINE_MSGF("mgs_config_read reply ", 0, + sizeof(struct mgs_config_res), + lustre_swab_mgs_config_res, NULL); +EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); + +struct req_msg_field RMF_U32 = + DEFINE_MSGF("generic u32", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_U32); + +struct req_msg_field RMF_SETINFO_VAL = + DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_VAL); + +struct req_msg_field RMF_GETINFO_KEY = + DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_KEY); + +struct req_msg_field RMF_GETINFO_VALLEN = + DEFINE_MSGF("getinfo_vallen", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VALLEN); + +struct req_msg_field RMF_GETINFO_VAL = + DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_GETINFO_VAL); + +struct req_msg_field RMF_SEQ_OPC = + DEFINE_MSGF("seq_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_SEQ_OPC); + +struct req_msg_field RMF_SEQ_RANGE = + DEFINE_MSGF("seq_query_range", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_SEQ_RANGE); + +struct req_msg_field RMF_FLD_OPC = + DEFINE_MSGF("fld_query_opc", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_FLD_OPC); + +struct req_msg_field RMF_FLD_MDFLD = + DEFINE_MSGF("fld_query_mdfld", 0, + sizeof(struct lu_seq_range), + lustre_swab_lu_seq_range, NULL); +EXPORT_SYMBOL(RMF_FLD_MDFLD); + +struct req_msg_field RMF_MDT_BODY = + DEFINE_MSGF("mdt_body", 0, + sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); +EXPORT_SYMBOL(RMF_MDT_BODY); + +struct req_msg_field RMF_OBD_QUOTACTL = + DEFINE_MSGF("obd_quotactl", 0, + sizeof(struct obd_quotactl), + lustre_swab_obd_quotactl, NULL); +EXPORT_SYMBOL(RMF_OBD_QUOTACTL); + +struct req_msg_field RMF_QUOTA_BODY = + DEFINE_MSGF("quota_body", 0, + sizeof(struct quota_body), lustre_swab_quota_body, NULL); +EXPORT_SYMBOL(RMF_QUOTA_BODY); + +struct req_msg_field RMF_MDT_EPOCH = + DEFINE_MSGF("mdt_ioepoch", 0, + sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); +EXPORT_SYMBOL(RMF_MDT_EPOCH); + +struct req_msg_field RMF_PTLRPC_BODY = + DEFINE_MSGF("ptlrpc_body", 0, + sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); +EXPORT_SYMBOL(RMF_PTLRPC_BODY); + +struct req_msg_field RMF_OBD_STATFS = + DEFINE_MSGF("obd_statfs", 0, + sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); +EXPORT_SYMBOL(RMF_OBD_STATFS); + +struct req_msg_field RMF_SETINFO_KEY = + DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SETINFO_KEY); + +struct req_msg_field RMF_NAME = + DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_NAME); + +struct req_msg_field RMF_SYMTGT = + DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_SYMTGT); + +struct req_msg_field RMF_TGTUUID = + DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_TGTUUID); + +struct req_msg_field RMF_CLUUID = + DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, + NULL); +EXPORT_SYMBOL(RMF_CLUUID); + +struct req_msg_field RMF_STRING = + DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_STRING); + +struct req_msg_field RMF_LLOGD_BODY = + DEFINE_MSGF("llogd_body", 0, + sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_BODY); + +struct req_msg_field RMF_LLOG_LOG_HDR = + DEFINE_MSGF("llog_log_hdr", 0, + sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); +EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); + +struct req_msg_field RMF_LLOGD_CONN_BODY = + DEFINE_MSGF("llogd_conn_body", 0, + sizeof(struct llogd_conn_body), + lustre_swab_llogd_conn_body, NULL); +EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); + +/* + * connection handle received in MDS_CONNECT request. + * + * No swabbing needed because struct lustre_handle contains only a 64-bit cookie + * that the client does not interpret at all. + */ +struct req_msg_field RMF_CONN = + DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); +EXPORT_SYMBOL(RMF_CONN); + +struct req_msg_field RMF_CONNECT_DATA = + DEFINE_MSGF("cdata", + RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, +#if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(2, 7, 50, 0) + sizeof(struct obd_connect_data), +#else +/* For interoperability with 1.8 and 2.0 clients/servers. + * The RPC verification code allows larger RPC buffers, but not + * smaller buffers. Until we no longer need to keep compatibility + * with older servers/clients we can only check that the buffer + * size is at least as large as obd_connect_data_v1. That is not + * not in itself harmful, since the chance of just corrupting this + * field is low. See JIRA LU-16 for details. */ + sizeof(struct obd_connect_data_v1), +#endif + lustre_swab_connect, NULL); +EXPORT_SYMBOL(RMF_CONNECT_DATA); + +struct req_msg_field RMF_DLM_REQ = + DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, + sizeof(struct ldlm_request), + lustre_swab_ldlm_request, NULL); +EXPORT_SYMBOL(RMF_DLM_REQ); + +struct req_msg_field RMF_DLM_REP = + DEFINE_MSGF("dlm_rep", 0, + sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); +EXPORT_SYMBOL(RMF_DLM_REP); + +struct req_msg_field RMF_LDLM_INTENT = + DEFINE_MSGF("ldlm_intent", 0, + sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); +EXPORT_SYMBOL(RMF_LDLM_INTENT); + +struct req_msg_field RMF_DLM_LVB = + DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); +EXPORT_SYMBOL(RMF_DLM_LVB); + +struct req_msg_field RMF_DLM_GL_DESC = + DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), + lustre_swab_gl_desc, NULL); +EXPORT_SYMBOL(RMF_DLM_GL_DESC); + +struct req_msg_field RMF_MDT_MD = + DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_MDT_MD); + +struct req_msg_field RMF_REC_REINT = + DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), + lustre_swab_mdt_rec_reint, NULL); +EXPORT_SYMBOL(RMF_REC_REINT); + +/* FIXME: this length should be defined as a macro */ +struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, + NULL, NULL); +EXPORT_SYMBOL(RMF_EADATA); + +struct req_msg_field RMF_ACL = + DEFINE_MSGF("acl", RMF_F_NO_SIZE_CHECK, + LUSTRE_POSIX_ACL_MAX_SIZE, NULL, NULL); +EXPORT_SYMBOL(RMF_ACL); + +/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ +struct req_msg_field RMF_LOGCOOKIES = + DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, + sizeof(struct llog_cookie), NULL, NULL); +EXPORT_SYMBOL(RMF_LOGCOOKIES); + +struct req_msg_field RMF_CAPA1 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA1); + +struct req_msg_field RMF_CAPA2 = + DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), + lustre_swab_lustre_capa, NULL); +EXPORT_SYMBOL(RMF_CAPA2); + +struct req_msg_field RMF_LAYOUT_INTENT = + DEFINE_MSGF("layout_intent", 0, + sizeof(struct layout_intent), lustre_swab_layout_intent, + NULL); +EXPORT_SYMBOL(RMF_LAYOUT_INTENT); + +/* + * OST request field. + */ +struct req_msg_field RMF_OST_BODY = + DEFINE_MSGF("ost_body", 0, + sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body); +EXPORT_SYMBOL(RMF_OST_BODY); + +struct req_msg_field RMF_OBD_IOOBJ = + DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, + sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); +EXPORT_SYMBOL(RMF_OBD_IOOBJ); + +struct req_msg_field RMF_NIOBUF_REMOTE = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, + sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, + dump_rniobuf); +EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); + +struct req_msg_field RMF_RCS = + DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), + lustre_swab_generic_32s, dump_rcs); +EXPORT_SYMBOL(RMF_RCS); + +struct req_msg_field RMF_OBD_ID = + DEFINE_MSGF("obd_id", 0, + sizeof(obd_id), lustre_swab_ost_last_id, NULL); +EXPORT_SYMBOL(RMF_OBD_ID); + +struct req_msg_field RMF_FID = + DEFINE_MSGF("fid", 0, + sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); +EXPORT_SYMBOL(RMF_FID); + +struct req_msg_field RMF_OST_ID = + DEFINE_MSGF("ost_id", 0, + sizeof(struct ost_id), lustre_swab_ost_id, NULL); +EXPORT_SYMBOL(RMF_OST_ID); + +struct req_msg_field RMF_FIEMAP_KEY = + DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), + lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_KEY); + +struct req_msg_field RMF_FIEMAP_VAL = + DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); +EXPORT_SYMBOL(RMF_FIEMAP_VAL); + +struct req_msg_field RMF_IDX_INFO = + DEFINE_MSGF("idx_info", 0, sizeof(struct idx_info), + lustre_swab_idx_info, NULL); +EXPORT_SYMBOL(RMF_IDX_INFO); +struct req_msg_field RMF_HSM_USER_STATE = + DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), + lustre_swab_hsm_user_state, NULL); +EXPORT_SYMBOL(RMF_HSM_USER_STATE); + +struct req_msg_field RMF_HSM_STATE_SET = + DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), + lustre_swab_hsm_state_set, NULL); +EXPORT_SYMBOL(RMF_HSM_STATE_SET); + +struct req_msg_field RMF_MDS_HSM_PROGRESS = + DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), + lustre_swab_hsm_progress_kernel, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); + +struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = + DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), + lustre_swab_hsm_current_action, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); + +struct req_msg_field RMF_MDS_HSM_USER_ITEM = + DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, + sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, + NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); + +struct req_msg_field RMF_MDS_HSM_ARCHIVE = + DEFINE_MSGF("hsm_archive", 0, + sizeof(__u32), lustre_swab_generic_32s, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); + +struct req_msg_field RMF_MDS_HSM_REQUEST = + DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), + lustre_swab_hsm_request, NULL); +EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); + +struct req_msg_field RMF_UPDATE = DEFINE_MSGF("update", 0, -1, + lustre_swab_update_buf, NULL); +EXPORT_SYMBOL(RMF_UPDATE); + +struct req_msg_field RMF_UPDATE_REPLY = DEFINE_MSGF("update_reply", 0, -1, + lustre_swab_update_reply_buf, + NULL); +EXPORT_SYMBOL(RMF_UPDATE_REPLY); + +struct req_msg_field RMF_SWAP_LAYOUTS = + DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), + lustre_swab_swap_layouts, NULL); +EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); +/* + * Request formats. + */ + +struct req_format { + const char *rf_name; + int rf_idx; + struct { + int nr; + const struct req_msg_field **d; + } rf_fields[RCL_NR]; +}; + +#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ + .rf_name = name, \ + .rf_fields = { \ + [RCL_CLIENT] = { \ + .nr = client_nr, \ + .d = client \ + }, \ + [RCL_SERVER] = { \ + .nr = server_nr, \ + .d = server \ + } \ + } \ +} + +#define DEFINE_REQ_FMT0(name, client, server) \ +DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) + +struct req_format RQF_OBD_PING = + DEFINE_REQ_FMT0("OBD_PING", empty, empty); +EXPORT_SYMBOL(RQF_OBD_PING); + +struct req_format RQF_OBD_SET_INFO = + DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); +EXPORT_SYMBOL(RQF_OBD_SET_INFO); + +/* Read index file through the network */ +struct req_format RQF_OBD_IDX_READ = + DEFINE_REQ_FMT0("OBD_IDX_READ", + obd_idx_read_client, obd_idx_read_server); +EXPORT_SYMBOL(RQF_OBD_IDX_READ); + +struct req_format RQF_SEC_CTX = + DEFINE_REQ_FMT0("SEC_CTX", empty, empty); +EXPORT_SYMBOL(RQF_SEC_CTX); + +struct req_format RQF_MGS_TARGET_REG = + DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, + mgs_target_info_only); +EXPORT_SYMBOL(RQF_MGS_TARGET_REG); + +struct req_format RQF_MGS_SET_INFO = + DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, + mgs_set_info); +EXPORT_SYMBOL(RQF_MGS_SET_INFO); + +struct req_format RQF_MGS_CONFIG_READ = + DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, + mgs_config_read_server); +EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); + +struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + +struct req_format RQF_LOG_CANCEL = + DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); +EXPORT_SYMBOL(RQF_LOG_CANCEL); + +struct req_format RQF_MDS_QUOTACHECK = + DEFINE_REQ_FMT0("MDS_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_MDS_QUOTACHECK); + +struct req_format RQF_OST_QUOTACHECK = + DEFINE_REQ_FMT0("OST_QUOTACHECK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_OST_QUOTACHECK); + +struct req_format RQF_MDS_QUOTACTL = + DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_MDS_QUOTACTL); + +struct req_format RQF_OST_QUOTACTL = + DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); +EXPORT_SYMBOL(RQF_OST_QUOTACTL); + +struct req_format RQF_QC_CALLBACK = + DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty); +EXPORT_SYMBOL(RQF_QC_CALLBACK); + +struct req_format RQF_QUOTA_DQACQ = + DEFINE_REQ_FMT0("QUOTA_DQACQ", quota_body_only, quota_body_only); +EXPORT_SYMBOL(RQF_QUOTA_DQACQ); + +struct req_format RQF_LDLM_INTENT_QUOTA = + DEFINE_REQ_FMT0("LDLM_INTENT_QUOTA", + ldlm_intent_quota_client, + ldlm_intent_quota_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_QUOTA); + +struct req_format RQF_MDS_GETSTATUS = + DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_GETSTATUS); + +struct req_format RQF_MDS_STATFS = + DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_MDS_STATFS); + +struct req_format RQF_MDS_SYNC = + DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_SYNC); + +struct req_format RQF_MDS_GETATTR = + DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR); + +struct req_format RQF_MDS_GETXATTR = + DEFINE_REQ_FMT0("MDS_GETXATTR", + mds_getxattr_client, mds_getxattr_server); +EXPORT_SYMBOL(RQF_MDS_GETXATTR); + +struct req_format RQF_MDS_GETATTR_NAME = + DEFINE_REQ_FMT0("MDS_GETATTR_NAME", + mds_getattr_name_client, mds_getattr_server); +EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); + +struct req_format RQF_MDS_REINT = + DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT); + +struct req_format RQF_MDS_REINT_CREATE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE", + mds_reint_create_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); + +struct req_format RQF_MDS_REINT_CREATE_RMT_ACL = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_RMT_ACL", + mds_reint_create_rmt_acl_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_RMT_ACL); + +struct req_format RQF_MDS_REINT_CREATE_SLAVE = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", + mds_reint_create_slave_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); + +struct req_format RQF_MDS_REINT_CREATE_SYM = + DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", + mds_reint_create_sym_client, mdt_body_capa); +EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); + +struct req_format RQF_MDS_REINT_OPEN = + DEFINE_REQ_FMT0("MDS_REINT_OPEN", + mds_reint_open_client, mds_reint_open_server); +EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); + +struct req_format RQF_MDS_REINT_UNLINK = + DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); + +struct req_format RQF_MDS_REINT_LINK = + DEFINE_REQ_FMT0("MDS_REINT_LINK", + mds_reint_link_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_LINK); + +struct req_format RQF_MDS_REINT_RENAME = + DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, + mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); + +struct req_format RQF_MDS_REINT_SETATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETATTR", + mds_reint_setattr_client, mds_setattr_server); +EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); + +struct req_format RQF_MDS_REINT_SETXATTR = + DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", + mds_reint_setxattr_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); + +struct req_format RQF_MDS_CONNECT = + DEFINE_REQ_FMT0("MDS_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_MDS_CONNECT); + +struct req_format RQF_MDS_DISCONNECT = + DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_MDS_DISCONNECT); + +struct req_format RQF_MDS_GET_INFO = + DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, + mds_getinfo_server); +EXPORT_SYMBOL(RQF_MDS_GET_INFO); + +struct req_format RQF_UPDATE_OBJ = + DEFINE_REQ_FMT0("OBJECT_UPDATE_OBJ", mds_update_client, + mds_update_server); +EXPORT_SYMBOL(RQF_UPDATE_OBJ); + +struct req_format RQF_LDLM_ENQUEUE = + DEFINE_REQ_FMT0("LDLM_ENQUEUE", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); + +struct req_format RQF_LDLM_ENQUEUE_LVB = + DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", + ldlm_enqueue_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); + +struct req_format RQF_LDLM_CONVERT = + DEFINE_REQ_FMT0("LDLM_CONVERT", + ldlm_enqueue_client, ldlm_enqueue_server); +EXPORT_SYMBOL(RQF_LDLM_CONVERT); + +struct req_format RQF_LDLM_CANCEL = + DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CANCEL); + +struct req_format RQF_LDLM_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CALLBACK); + +struct req_format RQF_LDLM_CP_CALLBACK = + DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); +EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); + +struct req_format RQF_LDLM_BL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); +EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); + +struct req_format RQF_LDLM_GL_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); + +struct req_format RQF_LDLM_GL_DESC_CALLBACK = + DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, + ldlm_gl_callback_server); +EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); + +struct req_format RQF_LDLM_INTENT_BASIC = + DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", + ldlm_intent_basic_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); + +struct req_format RQF_LDLM_INTENT = + DEFINE_REQ_FMT0("LDLM_INTENT", + ldlm_intent_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT); + +struct req_format RQF_LDLM_INTENT_LAYOUT = + DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", + ldlm_intent_layout_client, ldlm_enqueue_lvb_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); + +struct req_format RQF_LDLM_INTENT_GETATTR = + DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", + ldlm_intent_getattr_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); + +struct req_format RQF_LDLM_INTENT_OPEN = + DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", + ldlm_intent_open_client, ldlm_intent_open_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); + +struct req_format RQF_LDLM_INTENT_CREATE = + DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", + ldlm_intent_create_client, ldlm_intent_getattr_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); + +struct req_format RQF_LDLM_INTENT_UNLINK = + DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", + ldlm_intent_unlink_client, ldlm_intent_server); +EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); + +struct req_format RQF_MDS_CLOSE = + DEFINE_REQ_FMT0("MDS_CLOSE", + mdt_close_client, mds_last_unlink_server); +EXPORT_SYMBOL(RQF_MDS_CLOSE); + +struct req_format RQF_MDS_PIN = + DEFINE_REQ_FMT0("MDS_PIN", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_PIN); + +struct req_format RQF_MDS_UNPIN = + DEFINE_REQ_FMT0("MDS_UNPIN", mdt_body_only, empty); +EXPORT_SYMBOL(RQF_MDS_UNPIN); + +struct req_format RQF_MDS_DONE_WRITING = + DEFINE_REQ_FMT0("MDS_DONE_WRITING", + mdt_close_client, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_DONE_WRITING); + +struct req_format RQF_MDS_READPAGE = + DEFINE_REQ_FMT0("MDS_READPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_READPAGE); + +struct req_format RQF_MDS_HSM_ACTION = + DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); +EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); + +struct req_format RQF_MDS_HSM_PROGRESS = + DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); + +struct req_format RQF_MDS_HSM_CT_REGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); + +struct req_format RQF_MDS_HSM_CT_UNREGISTER = + DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); + +struct req_format RQF_MDS_HSM_STATE_GET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", + mdt_body_capa, mdt_hsm_state_get_server); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); + +struct req_format RQF_MDS_HSM_STATE_SET = + DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); + +struct req_format RQF_MDS_HSM_REQUEST = + DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); +EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); + +struct req_format RQF_MDS_SWAP_LAYOUTS = + DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", + mdt_swap_layouts, empty); +EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); + +/* This is for split */ +struct req_format RQF_MDS_WRITEPAGE = + DEFINE_REQ_FMT0("MDS_WRITEPAGE", + mdt_body_capa, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_WRITEPAGE); + +struct req_format RQF_MDS_IS_SUBDIR = + DEFINE_REQ_FMT0("MDS_IS_SUBDIR", + mdt_body_only, mdt_body_only); +EXPORT_SYMBOL(RQF_MDS_IS_SUBDIR); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", + llog_origin_handle_create_client, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", + llogd_body_only, llogd_body_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", + llogd_body_only, llog_origin_handle_next_block_server); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); + +struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = + DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", + llogd_body_only, llog_log_hdr_only); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); + +struct req_format RQF_LLOG_ORIGIN_CONNECT = + DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); +EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); + +struct req_format RQF_OST_CONNECT = + DEFINE_REQ_FMT0("OST_CONNECT", + obd_connect_client, obd_connect_server); +EXPORT_SYMBOL(RQF_OST_CONNECT); + +struct req_format RQF_OST_DISCONNECT = + DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); +EXPORT_SYMBOL(RQF_OST_DISCONNECT); + +struct req_format RQF_OST_GETATTR = + DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_GETATTR); + +struct req_format RQF_OST_SETATTR = + DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SETATTR); + +struct req_format RQF_OST_CREATE = + DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); +EXPORT_SYMBOL(RQF_OST_CREATE); + +struct req_format RQF_OST_PUNCH = + DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_PUNCH); + +struct req_format RQF_OST_SYNC = + DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); +EXPORT_SYMBOL(RQF_OST_SYNC); + +struct req_format RQF_OST_DESTROY = + DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); +EXPORT_SYMBOL(RQF_OST_DESTROY); + +struct req_format RQF_OST_BRW_READ = + DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); +EXPORT_SYMBOL(RQF_OST_BRW_READ); + +struct req_format RQF_OST_BRW_WRITE = + DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); +EXPORT_SYMBOL(RQF_OST_BRW_WRITE); + +struct req_format RQF_OST_STATFS = + DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); +EXPORT_SYMBOL(RQF_OST_STATFS); + +struct req_format RQF_OST_SET_GRANT_INFO = + DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, + ost_body_only); +EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); + +struct req_format RQF_OST_GET_INFO_GENERIC = + DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, + ost_get_info_generic_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_GENERIC); + +struct req_format RQF_OST_GET_INFO_LAST_ID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, + ost_get_last_id_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); + +struct req_format RQF_OST_GET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", obd_set_info_client, + ost_get_last_fid_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); + +struct req_format RQF_OST_SET_INFO_LAST_FID = + DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, + empty); +EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); + +struct req_format RQF_OST_GET_INFO_FIEMAP = + DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, + ost_get_fiemap_server); +EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); + +#if !defined(__REQ_LAYOUT_USER__) + +/* Convenience macro */ +#define FMT_FIELD(fmt, i, j) (fmt)->rf_fields[(i)].d[(j)] + +/** + * Initializes the capsule abstraction by computing and setting the \a rf_idx + * field of RQFs and the \a rmf_offset field of RMFs. + */ +int req_layout_init(void) +{ + int i; + int j; + int k; + struct req_format *rf = NULL; + + for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { + rf = req_formats[i]; + rf->rf_idx = i; + for (j = 0; j < RCL_NR; ++j) { + LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); + for (k = 0; k < rf->rf_fields[j].nr; ++k) { + struct req_msg_field *field; + + field = (typeof(field))rf->rf_fields[j].d[k]; + LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) + || field->rmf_size > 0); + LASSERT(field->rmf_offset[i][j] == 0); + /* + * k + 1 to detect unused format/field + * combinations. + */ + field->rmf_offset[i][j] = k + 1; + } + } + } + return 0; +} +EXPORT_SYMBOL(req_layout_init); + +void req_layout_fini(void) +{ +} +EXPORT_SYMBOL(req_layout_fini); + +/** + * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. + * + * Actual/expected field sizes are set elsewhere in functions in this file: + * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and + * req_capsule_msg_size(). The \a rc_area information is used by. + * ptlrpc_request_set_replen(). + */ +void req_capsule_init_area(struct req_capsule *pill) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { + pill->rc_area[RCL_CLIENT][i] = -1; + pill->rc_area[RCL_SERVER][i] = -1; + } +} +EXPORT_SYMBOL(req_capsule_init_area); + +/** + * Initialize a pill. + * + * The \a location indicates whether the caller is executing on the client side + * (RCL_CLIENT) or server side (RCL_SERVER).. + */ +void req_capsule_init(struct req_capsule *pill, + struct ptlrpc_request *req, + enum req_location location) +{ + LASSERT(location == RCL_SERVER || location == RCL_CLIENT); + + /* + * Today all capsules are embedded in ptlrpc_request structs, + * but just in case that ever isn't the case, we don't reach + * into req unless req != NULL and pill is the one embedded in + * the req. + * + * The req->rq_pill_init flag makes it safe to initialize a pill + * twice, which might happen in the OST paths as a result of the + * high-priority RPC queue getting peeked at before ost_handle() + * handles an OST RPC. + */ + if (req != NULL && pill == &req->rq_pill && req->rq_pill_init) + return; + + memset(pill, 0, sizeof *pill); + pill->rc_req = req; + pill->rc_loc = location; + req_capsule_init_area(pill); + + if (req != NULL && pill == &req->rq_pill) + req->rq_pill_init = 1; +} +EXPORT_SYMBOL(req_capsule_init); + +void req_capsule_fini(struct req_capsule *pill) +{ +} +EXPORT_SYMBOL(req_capsule_fini); + +static int __req_format_is_sane(const struct req_format *fmt) +{ + return + 0 <= fmt->rf_idx && fmt->rf_idx < ARRAY_SIZE(req_formats) && + req_formats[fmt->rf_idx] == fmt; +} + +static struct lustre_msg *__req_msg(const struct req_capsule *pill, + enum req_location loc) +{ + struct ptlrpc_request *req; + + req = pill->rc_req; + return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; +} + +/** + * Set the format (\a fmt) of a \a pill; format changes are not allowed here + * (see req_capsule_extend()). + */ +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) +{ + LASSERT(pill->rc_fmt == NULL || pill->rc_fmt == fmt); + LASSERT(__req_format_is_sane(fmt)); + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_set); + +/** + * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in + * yet. + + * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of + * variable-sized fields. The field sizes come from the declared \a rmf_size + * field of a \a pill's \a rc_fmt's RMF's. + */ +int req_capsule_filled_sizes(struct req_capsule *pill, + enum req_location loc) +{ + const struct req_format *fmt = pill->rc_fmt; + int i; + + LASSERT(fmt != NULL); + + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + if (pill->rc_area[loc][i] == -1) { + pill->rc_area[loc][i] = + fmt->rf_fields[loc].d[i]->rmf_size; + if (pill->rc_area[loc][i] == -1) { + /* + * Skip the following fields. + * + * If this LASSERT() trips then you're missing a + * call to req_capsule_set_size(). + */ + LASSERT(loc != RCL_SERVER); + break; + } + } + } + return i; +} +EXPORT_SYMBOL(req_capsule_filled_sizes); + +/** + * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). + * + * This function uses the \a pill's \a rc_area as filled in by + * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by + * this function). + */ +int req_capsule_server_pack(struct req_capsule *pill) +{ + const struct req_format *fmt; + int count; + int rc; + + LASSERT(pill->rc_loc == RCL_SERVER); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + + count = req_capsule_filled_sizes(pill, RCL_SERVER); + rc = lustre_pack_reply(pill->rc_req, count, + pill->rc_area[RCL_SERVER], NULL); + if (rc != 0) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Cannot pack %d fields in format `%s': ", + count, fmt->rf_name); + } + return rc; +} +EXPORT_SYMBOL(req_capsule_server_pack); + +/** + * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill + * corresponding to the given RMF (\a field). + */ +static int __req_capsule_offset(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; + LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", + pill->rc_fmt->rf_name, + field->rmf_name, offset, loc); + offset --; + + LASSERT(0 <= offset && offset < REQ_MAX_FIELD_NR); + return offset; +} + +/** + * Helper for __req_capsule_get(); swabs value / array of values and/or dumps + * them if desired. + */ +static +void +swabber_dumper_helper(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + int offset, + void *value, int len, int dump, void (*swabber)( void *)) +{ + void *p; + int i; + int n; + int do_swab; + int inout = loc == RCL_CLIENT; + + swabber = swabber ?: field->rmf_swabber; + + if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && + swabber != NULL && value != NULL) + do_swab = 1; + else + do_swab = 0; + + if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { + if (dump && field->rmf_dumper) { + CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", + do_swab ? "unswabbed " : "", field->rmf_name); + field->rmf_dumper(value); + } + if (!do_swab) + return; + swabber(value); + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); + if (dump) { + CDEBUG(D_RPCTRACE, "Dump of swabbed field %s " + "follows\n", field->rmf_name); + field->rmf_dumper(value); + } + + return; + } + + /* + * We're swabbing an array; swabber() swabs a single array element, so + * swab every element. + */ + LASSERT((len % field->rmf_size) == 0); + for (p = value, i = 0, n = len / field->rmf_size; + i < n; + i++, p += field->rmf_size) { + if (dump && field->rmf_dumper) { + CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, " + "element %d follows\n", + do_swab ? "unswabbed " : "", field->rmf_name, i); + field->rmf_dumper(p); + } + if (!do_swab) + continue; + swabber(p); + if (dump && field->rmf_dumper) { + CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, " + "element %d follows\n", field->rmf_name, i); + field->rmf_dumper(value); + } + } + if (do_swab) + ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); +} + +/** + * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill + * corresponding to the given RMF (\a field). + * + * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL + * then the \a rmf_swabber from the RMF will be used. Soon there will be no + * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then + * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each + * element of the array swabbed. + */ +static void *__req_capsule_get(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, + void (*swabber)( void *), + int dump) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + void *value; + int len; + int offset; + + void *(*getter)(struct lustre_msg *m, int n, int minlen); + + static const char *rcl_names[RCL_NR] = { + [RCL_CLIENT] = "client", + [RCL_SERVER] = "server" + }; + + LASSERT(pill != NULL); + LASSERT(pill != LP_POISON); + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(fmt != LP_POISON); + LASSERT(__req_format_is_sane(fmt)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + LASSERT(msg != NULL); + + getter = (field->rmf_flags & RMF_F_STRING) ? + (typeof(getter))lustre_msg_string : lustre_msg_buf; + + if (field->rmf_flags & RMF_F_STRUCT_ARRAY) { + /* + * We've already asserted that field->rmf_size > 0 in + * req_layout_init(). + */ + len = lustre_msg_buflen(msg, offset); + if ((len % field->rmf_size) != 0) { + CERROR("%s: array field size mismatch " + "%d modulo %d != 0 (%d)\n", + field->rmf_name, len, field->rmf_size, loc); + return NULL; + } + } else if (pill->rc_area[loc][offset] != -1) { + len = pill->rc_area[loc][offset]; + } else { + len = max(field->rmf_size, 0); + } + value = getter(msg, offset, len); + + if (value == NULL) { + DEBUG_REQ(D_ERROR, pill->rc_req, + "Wrong buffer for field `%s' (%d of %d) " + "in format `%s': %d vs. %d (%s)\n", + field->rmf_name, offset, lustre_msg_bufcount(msg), + fmt->rf_name, lustre_msg_buflen(msg, offset), len, + rcl_names[loc]); + } else { + swabber_dumper_helper(pill, field, loc, offset, value, len, + dump, swabber); + } + + return value; +} + +/** + * Dump a request and/or reply + */ +void __req_capsule_dump(struct req_capsule *pill, enum req_location loc) +{ + const struct req_format *fmt; + const struct req_msg_field *field; + int len; + int i; + + fmt = pill->rc_fmt; + + DEBUG_REQ(D_RPCTRACE, pill->rc_req, "BEGIN REQ CAPSULE DUMP\n"); + for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { + field = FMT_FIELD(fmt, loc, i); + if (field->rmf_dumper == NULL) { + /* + * FIXME Add a default hex dumper for fields that don't + * have a specific dumper + */ + len = req_capsule_get_size(pill, field, loc); + CDEBUG(D_RPCTRACE, "Field %s has no dumper function;" + "field size is %d\n", field->rmf_name, len); + } else { + /* It's the dumping side-effect that we're interested in */ + (void) __req_capsule_get(pill, field, loc, NULL, 1); + } + } + CDEBUG(D_RPCTRACE, "END REQ CAPSULE DUMP\n"); +} + +/** + * Dump a request. + */ +void req_capsule_client_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_CLIENT); +} +EXPORT_SYMBOL(req_capsule_client_dump); + +/** + * Dump a reply + */ +void req_capsule_server_dump(struct req_capsule *pill) +{ + __req_capsule_dump(pill, RCL_SERVER); +} +EXPORT_SYMBOL(req_capsule_server_dump); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_get); + +/** + * Same as req_capsule_client_get(), but with a \a swabber argument. + * + * Currently unused; will be removed when req_capsule_server_swab_get() is + * unused too. + */ +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_client_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_client_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_CLIENT, len); + return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_client_sized_get); + +/** + * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply + * buffer corresponding to the given RMF (\a field) of a \a pill. + */ +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_get); + +/** + * Same as req_capsule_server_get(), but with a \a swabber argument. + * + * Ideally all swabbing should be done pursuant to RMF definitions, with no + * swabbing done outside this capsule abstraction. + */ +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber) +{ + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_swab_get); + +/** + * Utility that combines req_capsule_set_size() and req_capsule_server_get(). + * + * First the \a pill's request \a field's size is set (\a rc_area) using + * req_capsule_set_size() with the given \a len. Then the actual buffer is + * returned. + */ +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_get); + +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber) +{ + req_capsule_set_size(pill, field, RCL_SERVER, len); + return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); +} +EXPORT_SYMBOL(req_capsule_server_sized_swab_get); + +/** + * Returns the buffer of a \a pill corresponding to the given \a field from the + * request (if the caller is executing on the server-side) or reply (if the + * caller is executing on the client-side). + * + * This function convienient for use is code that could be executed on the + * client and server alike. + */ +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field) +{ + return __req_capsule_get(pill, field, pill->rc_loc ^ 1, NULL, 0); +} +EXPORT_SYMBOL(req_capsule_other_get); + +/** + * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a + * field of the given \a pill. + * + * This function must be used when constructing variable sized fields of a + * request or reply. + */ +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + if ((size != field->rmf_size) && + (field->rmf_size != -1) && + !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && + (size > 0)) { + if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && + (size % field->rmf_size != 0)) { + CERROR("%s: array field size mismatch " + "%d %% %d != 0 (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && + size < field->rmf_size) { + CERROR("%s: field size mismatch %d != %d (%d)\n", + field->rmf_name, size, field->rmf_size, loc); + LBUG(); + } + } + + pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size; +} +EXPORT_SYMBOL(req_capsule_set_size); + +/** + * Return the actual PTLRPC buffer length of a request or reply (\a loc) + * for the given \a pill's given \a field. + * + * NB: this function doesn't correspond with req_capsule_set_size(), which + * actually sets the size in pill.rc_area[loc][offset], but this function + * returns the message buflen[offset], maybe we should use another name. + */ +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return lustre_msg_buflen(__req_msg(pill, loc), + __req_capsule_offset(pill, field, loc)); +} +EXPORT_SYMBOL(req_capsule_get_size); + +/** + * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the + * given \a pill's request or reply (\a loc) given the field size recorded in + * the \a pill's rc_area. + * + * See also req_capsule_set_size(). + */ +int req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) +{ + return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, + pill->rc_fmt->rf_fields[loc].nr, + pill->rc_area[loc]); +} + +/** + * While req_capsule_msg_size() computes the size of a PTLRPC request or reply + * (\a loc) given a \a pill's \a rc_area, this function computes the size of a + * PTLRPC request or reply given only an RQF (\a fmt). + * + * This function should not be used for formats which contain variable size + * fields. + */ +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc) +{ + int size, i = 0; + + /* + * This function should probably LASSERT() that fmt has no fields with + * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many + * elements in the array there will ultimately be, but then, we could + * assume that there will be at least one element, and that's just what + * we do. + */ + size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); + if (size < 0) + return size; + + for (; i < fmt->rf_fields[loc].nr; ++i) + if (fmt->rf_fields[loc].d[i]->rmf_size != -1) + size += cfs_size_round(fmt->rf_fields[loc].d[i]-> + rmf_size); + return size; +} + +/** + * Changes the format of an RPC. + * + * The pill must already have been initialized, which means that it already has + * a request format. The new format \a fmt must be an extension of the pill's + * old format. Specifically: the new format must have as many request and reply + * fields as the old one, and all fields shared by the old and new format must + * be at least as large in the new format. + * + * The new format's fields may be of different "type" than the old format, but + * only for fields that are "opaque" blobs: fields which have a) have no + * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a + * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, + * OBD_SET_INFO has a key field and an opaque value field that gets interpreted + * according to the key field. When the value, according to the key, contains a + * structure (or array thereof) to be swabbed, the format should be changed to + * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set + * accordingly. + */ +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) +{ + int i; + int j; + + const struct req_format *old; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + + old = pill->rc_fmt; + /* + * Sanity checking... + */ + for (i = 0; i < RCL_NR; ++i) { + LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); + for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { + const struct req_msg_field *ofield = FMT_FIELD(old, i, j); + + /* "opaque" fields can be transmogrified */ + if (ofield->rmf_swabber == NULL && + (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && + (ofield->rmf_size == -1 || + ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) + continue; + LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); + } + /* + * Last field in old format can be shorter than in new. + */ + LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= + FMT_FIELD(old, i, j)->rmf_size); + } + + pill->rc_fmt = fmt; +} +EXPORT_SYMBOL(req_capsule_extend); + +/** + * This function returns a non-zero value if the given \a field is present in + * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it + * returns 0. + */ +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + + return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; +} +EXPORT_SYMBOL(req_capsule_has_field); + +/** + * Returns a non-zero value if the given \a field is present in the given \a + * pill's PTLRPC request or reply (\a loc), else it returns 0. + */ +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc) +{ + int offset; + + LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); + LASSERT(req_capsule_has_field(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; +} +EXPORT_SYMBOL(req_capsule_field_present); + +/** + * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC + * request or reply (\a loc). + * + * This is not the opposite of req_capsule_extend(). + */ +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc) +{ + const struct req_format *fmt; + struct lustre_msg *msg; + int len; + int offset; + + fmt = pill->rc_fmt; + LASSERT(fmt != NULL); + LASSERT(__req_format_is_sane(fmt)); + LASSERT(req_capsule_has_field(pill, field, loc)); + LASSERT(req_capsule_field_present(pill, field, loc)); + + offset = __req_capsule_offset(pill, field, loc); + + msg = __req_msg(pill, loc); + len = lustre_msg_buflen(msg, offset); + LASSERTF(newlen <= len, "%s:%s, oldlen=%d, newlen=%d\n", + fmt->rf_name, field->rmf_name, len, newlen); + + if (loc == RCL_CLIENT) + pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, + 1); + else + pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, + 1); +} +EXPORT_SYMBOL(req_capsule_shrink); + +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen) +{ + struct ptlrpc_reply_state *rs = pill->rc_req->rq_reply_state, *nrs; + char *from, *to; + int offset, len, rc; + + LASSERT(pill->rc_fmt != NULL); + LASSERT(__req_format_is_sane(pill->rc_fmt)); + LASSERT(req_capsule_has_field(pill, field, RCL_SERVER)); + LASSERT(req_capsule_field_present(pill, field, RCL_SERVER)); + + len = req_capsule_get_size(pill, field, RCL_SERVER); + offset = __req_capsule_offset(pill, field, RCL_SERVER); + if (pill->rc_req->rq_repbuf_len >= + lustre_packed_msg_size(pill->rc_req->rq_repmsg) - len + newlen) + CERROR("Inplace repack might be done\n"); + + pill->rc_req->rq_reply_state = NULL; + req_capsule_set_size(pill, field, RCL_SERVER, newlen); + rc = req_capsule_server_pack(pill); + if (rc) { + /* put old rs back, the caller will decide what to do */ + pill->rc_req->rq_reply_state = rs; + return rc; + } + nrs = pill->rc_req->rq_reply_state; + /* Now we need only buffers, copy first chunk */ + to = lustre_msg_buf(nrs->rs_msg, 0, 0); + from = lustre_msg_buf(rs->rs_msg, 0, 0); + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) - from; + memcpy(to, from, len); + /* check if we have tail and copy it too */ + if (rs->rs_msg->lm_bufcount > offset + 1) { + to = lustre_msg_buf(nrs->rs_msg, offset + 1, 0); + from = lustre_msg_buf(rs->rs_msg, offset + 1, 0); + offset = rs->rs_msg->lm_bufcount - 1; + len = (char *)lustre_msg_buf(rs->rs_msg, offset, 0) + + cfs_size_round(rs->rs_msg->lm_buflens[offset]) - from; + memcpy(to, from, len); + } + /* drop old reply if everything is fine */ + if (rs->rs_difficult) { + /* copy rs data */ + int i; + + nrs->rs_difficult = 1; + nrs->rs_no_ack = rs->rs_no_ack; + for (i = 0; i < rs->rs_nlocks; i++) { + nrs->rs_locks[i] = rs->rs_locks[i]; + nrs->rs_modes[i] = rs->rs_modes[i]; + nrs->rs_nlocks++; + } + rs->rs_nlocks = 0; + rs->rs_difficult = 0; + rs->rs_no_ack = 0; + } + ptlrpc_rs_decref(rs); + return 0; +} +EXPORT_SYMBOL(req_capsule_server_grow); +/* __REQ_LAYOUT_USER__ */ +#endif diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c new file mode 100644 index 000000000000..367ca8ef7d60 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c @@ -0,0 +1,354 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_client.c + * + * remote api for llog - client side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include + +#include +#include +#include +#include + +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_unlock(&ctxt->loc_mutex); \ + return (-EINVAL); \ + } \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_lock(&ctxt->loc_mutex); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p\n", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_unlock(&ctxt->loc_mutex); \ +} while(0) + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. */ +static int llog_client_open(const struct lu_env *env, + struct llog_handle *lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + struct obd_import *imp; + struct llogd_body *body; + struct llog_ctxt *ctxt = lgh->lgh_ctxt; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(ctxt, imp); + + /* client cannot create llog */ + LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); + LASSERT(lgh); + + req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); + if (req == NULL) + GOTO(out, rc = -ENOMEM); + + if (name) + req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, + strlen(name) + 1); + + rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_CREATE); + if (rc) { + ptlrpc_request_free(req); + req = NULL; + GOTO(out, rc); + } + ptlrpc_request_set_replen(req); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (logid) + body->lgd_logid = *logid; + body->lgd_ctxt_idx = ctxt->loc_idx - 1; + + if (name) { + char *tmp; + tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, + strlen(name) + 1); + LASSERT(tmp); + strcpy(tmp, name); + } + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc = -EFAULT); + + lgh->lgh_id = body->lgd_logid; + lgh->lgh_ctxt = ctxt; + EXIT; +out: + LLOG_CLIENT_EXIT(ctxt, imp); + ptlrpc_req_finished(req); + return rc; +} + +static int llog_client_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_DESTROY, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_DESTROY); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", imp->imp_obd->obd_name, + body->lgd_llh_flags); + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + RETURN(rc); +} + + +static int llog_client_next_block(const struct lu_env *env, + struct llog_handle *loghandle, + int *cur_idx, int next_idx, + __u64 *cur_offset, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK); + if (req == NULL) + GOTO(err_exit, rc =-ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = next_idx; + body->lgd_saved_index = *cur_idx; + body->lgd_len = len; + body->lgd_cur_offset = *cur_offset; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc =-EFAULT); + + /* The log records are swabbed as they are processed */ + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + *cur_idx = body->lgd_saved_index; + *cur_offset = body->lgd_cur_offset; + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + void *ptr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_PREV_BLOCK); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; + body->lgd_index = prev_idx; + body->lgd_len = len; + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + GOTO(out, rc =-EFAULT); + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + if (ptr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(buf, ptr, len); + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct obd_import *imp; + struct ptlrpc_request *req = NULL; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct llog_rec_hdr *llh_hdr; + int rc; + ENTRY; + + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); + req = ptlrpc_request_alloc_pack(imp,&RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, + LUSTRE_LOG_VERSION, + LLOG_ORIGIN_HANDLE_READ_HEADER); + if (req == NULL) + GOTO(err_exit, rc = -ENOMEM); + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = handle->lgh_id; + body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; + body->lgd_llh_flags = handle->lgh_hdr->llh_flags; + + ptlrpc_request_set_replen(req); + rc = ptlrpc_queue_wait(req); + if (rc) + GOTO(out, rc); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + if (hdr == NULL) + GOTO(out, rc =-EFAULT); + + memcpy(handle->lgh_hdr, hdr, sizeof (*hdr)); + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + /* sanity checks */ + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("bad log header magic: %#x (expecting %#x)\n", + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + rc = -EIO; + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("incorrectly sized log header: %#x " + "(expecting %#x)\n", + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + EXIT; +out: + ptlrpc_req_finished(req); +err_exit: + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); + return rc; +} + +static int llog_client_close(const struct lu_env *env, + struct llog_handle *handle) +{ + /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because + the servers all close the file at the end of every + other LLOG_ RPC. */ + return(0); +} + +struct llog_operations llog_client_ops = { + .lop_next_block = llog_client_next_block, + .lop_prev_block = llog_client_prev_block, + .lop_read_header = llog_client_read_header, + .lop_open = llog_client_open, + .lop_destroy = llog_client_destroy, + .lop_close = llog_client_close, +}; +EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c new file mode 100644 index 000000000000..a81f557d7794 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_net.c + * + * OST<->MDS recovery logging infrastructure. + * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#include + +#include +#include +#include +#include +#include + +int llog_initiator_connect(struct llog_ctxt *ctxt) +{ + struct obd_import *new_imp; + ENTRY; + + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + LASSERTF(ctxt->loc_imp == NULL || ctxt->loc_imp == new_imp, + "%p - %p\n", ctxt->loc_imp, new_imp); + mutex_lock(&ctxt->loc_mutex); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_unlock(&ctxt->loc_mutex); + RETURN(0); +} +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_server.c b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c new file mode 100644 index 000000000000..bc1fcd8c7e73 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/llog_server.c @@ -0,0 +1,466 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/llog_server.c + * + * remote api for llog - server side + * + * Author: Andreas Dilger + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include +#include +#include +#include + +#if defined(LUSTRE_LOG_SERVER) +static int llog_origin_close(const struct lu_env *env, struct llog_handle *lgh) +{ + if (lgh->lgh_hdr != NULL && lgh->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + return llog_cat_close(env, lgh); + else + return llog_close(env, lgh); +} + +/* Only open is supported, no new llog can be created remotely */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct obd_device *disk_obd; + struct lvfs_run_ctxt saved; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + char *name = NULL; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(-EFAULT); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (req_capsule_field_present(&req->rq_pill, &RMF_NAME, RCL_CLIENT)) { + name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); + if (name == NULL) + RETURN(-EFAULT); + CDEBUG(D_INFO, "%s: opening log %s\n", obd->obd_name, name); + } + + ctxt = llog_get_context(obd, body->lgd_ctxt_idx); + if (ctxt == NULL) { + CDEBUG(D_WARNING, "%s: no ctxt. group=%p idx=%d name=%s\n", + obd->obd_name, &obd->obd_olg, body->lgd_ctxt_idx, name); + RETURN(-ENODEV); + } + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, logid, + name, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_pop, rc); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + body->lgd_logid = loghandle->lgh_id; + + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + return rc; +} +EXPORT_SYMBOL(llog_origin_handle_open); + +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + struct obd_device *disk_obd; + struct lvfs_run_ctxt saved; + struct llogd_body *body; + struct llog_logid *logid = NULL; + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(-EFAULT); + + if (ostid_id(&body->lgd_logid.lgl_oi) > 0) + logid = &body->lgd_logid; + + if (!(body->lgd_llh_flags & LLOG_F_IS_PLAIN)) + CERROR("%s: wrong llog flags %x\n", + req->rq_export->exp_obd->obd_name, body->lgd_llh_flags); + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = req_capsule_server_pack(&req->rq_pill); + /* erase only if no error and logid is valid */ + if (rc == 0) + rc = llog_erase(req->rq_svc_thread->t_env, ctxt, logid, NULL); + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + RETURN(rc); +} +EXPORT_SYMBOL(llog_origin_handle_destroy); + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + struct obd_device *disk_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct lvfs_run_ctxt saved; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(-EFAULT); + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_pop, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_next_block(req->rq_svc_thread->t_env, loghandle, + &repbody->lgd_saved_index, repbody->lgd_index, + &repbody->lgd_cur_offset, ptr, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + return rc; +} +EXPORT_SYMBOL(llog_origin_handle_next_block); + +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + struct llog_handle *loghandle; + struct llogd_body *body; + struct llogd_body *repbody; + struct obd_device *disk_obd; + struct lvfs_run_ctxt saved; + struct llog_ctxt *ctxt; + __u32 flags; + void *ptr; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(-EFAULT); + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_pop, rc); + + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + + req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, + LLOG_CHUNK_SIZE); + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + repbody = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); + *repbody = *body; + + ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); + rc = llog_prev_block(req->rq_svc_thread->t_env, loghandle, + body->lgd_index, ptr, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out_close, rc); + + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + return rc; +} +EXPORT_SYMBOL(llog_origin_handle_prev_block); + +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + struct obd_device *disk_obd; + struct llog_handle *loghandle; + struct llogd_body *body; + struct llog_log_hdr *hdr; + struct lvfs_run_ctxt saved; + struct llog_ctxt *ctxt; + __u32 flags; + int rc; + + ENTRY; + + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); + if (body == NULL) + RETURN(-EFAULT); + + ctxt = llog_get_context(req->rq_export->exp_obd, body->lgd_ctxt_idx); + if (ctxt == NULL) + RETURN(-ENODEV); + + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + + rc = llog_open(req->rq_svc_thread->t_env, ctxt, &loghandle, + &body->lgd_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) + GOTO(out_pop, rc); + + /* + * llog_init_handle() reads the llog header + */ + flags = body->lgd_llh_flags; + rc = llog_init_handle(req->rq_svc_thread->t_env, loghandle, flags, + NULL); + if (rc) + GOTO(out_close, rc); + flags = loghandle->lgh_hdr->llh_flags; + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out_close, rc = -ENOMEM); + + hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); + *hdr = *loghandle->lgh_hdr; + EXIT; +out_close: + llog_origin_close(req->rq_svc_thread->t_env, loghandle); +out_pop: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + return rc; +} +EXPORT_SYMBOL(llog_origin_handle_read_header); + +int llog_origin_handle_close(struct ptlrpc_request *req) +{ + ENTRY; + /* Nothing to do */ + RETURN(0); +} +EXPORT_SYMBOL(llog_origin_handle_close); + +int llog_origin_handle_cancel(struct ptlrpc_request *req) +{ + int num_cookies, rc = 0, err, i, failed = 0; + struct obd_device *disk_obd; + struct llog_cookie *logcookies; + struct llog_ctxt *ctxt = NULL; + struct lvfs_run_ctxt saved; + struct llog_handle *cathandle; + struct inode *inode; + void *handle; + ENTRY; + + logcookies = req_capsule_client_get(&req->rq_pill, &RMF_LOGCOOKIES); + num_cookies = req_capsule_get_size(&req->rq_pill, &RMF_LOGCOOKIES, + RCL_CLIENT) / sizeof(*logcookies); + if (logcookies == NULL || num_cookies == 0) { + DEBUG_REQ(D_HA, req, "No llog cookies sent"); + RETURN(-EFAULT); + } + + ctxt = llog_get_context(req->rq_export->exp_obd, + logcookies->lgc_subsys); + if (ctxt == NULL) + RETURN(-ENODEV); + + disk_obd = ctxt->loc_exp->exp_obd; + push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + for (i = 0; i < num_cookies; i++, logcookies++) { + cathandle = ctxt->loc_handle; + LASSERT(cathandle != NULL); + inode = cathandle->lgh_file->f_dentry->d_inode; + + handle = fsfilt_start_log(disk_obd, inode, + FSFILT_OP_CANCEL_UNLINK, NULL, 1); + if (IS_ERR(handle)) { + CERROR("fsfilt_start_log() failed: %ld\n", + PTR_ERR(handle)); + GOTO(pop_ctxt, rc = PTR_ERR(handle)); + } + + rc = llog_cat_cancel_records(req->rq_svc_thread->t_env, + cathandle, 1, logcookies); + + /* + * Do not raise -ENOENT errors for resent rpcs. This rec already + * might be killed. + */ + if (rc == -ENOENT && + (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) { + /* + * Do not change this message, reply-single.sh test_59b + * expects to find this in log. + */ + CDEBUG(D_RPCTRACE, "RESENT cancel req %p - ignored\n", + req); + rc = 0; + } else if (rc == 0) { + CDEBUG(D_RPCTRACE, "Canceled %d llog-records\n", + num_cookies); + } + + err = fsfilt_commit(disk_obd, inode, handle, 0); + if (err) { + CERROR("Error committing transaction: %d\n", err); + if (!rc) + rc = err; + failed++; + GOTO(pop_ctxt, rc); + } else if (rc) + failed++; + } + GOTO(pop_ctxt, rc); +pop_ctxt: + pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL); + if (rc) + CERROR("Cancel %d of %d llog-records failed: %d\n", + failed, num_cookies, rc); + + llog_ctxt_put(ctxt); + return rc; +} +EXPORT_SYMBOL(llog_origin_handle_cancel); + +#else /* !__KERNEL__ */ +int llog_origin_handle_open(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} + +int llog_origin_handle_destroy(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} + +int llog_origin_handle_next_block(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_prev_block(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_read_header(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_close(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +int llog_origin_handle_cancel(struct ptlrpc_request *req) +{ + LBUG(); + return 0; +} +#endif diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c new file mode 100644 index 000000000000..031c0f9abb82 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c @@ -0,0 +1,1401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#define DEBUG_SUBSYSTEM S_CLASS + + +#include +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + + +struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { 14, NULL }, /* formerly OST_SAN_READ */ + { 15, NULL }, /* formerly OST_SAN_WRITE */ + { OST_SYNC, "ost_sync" }, + { OST_SET_INFO, "ost_set_info" }, + { OST_QUOTACHECK, "ost_quotacheck" }, + { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_lock" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GETSTATUS, "mds_getstatus" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_PIN, "mds_pin" }, + { MDS_UNPIN, "mds_unpin" }, + { MDS_SYNC, "mds_sync" }, + { MDS_DONE_WRITING, "mds_done_writing" }, + { MDS_SET_INFO, "mds_set_info" }, + { MDS_QUOTACHECK, "mds_quotacheck" }, + { MDS_QUOTACTL, "mds_quotactl" }, + { MDS_GETXATTR, "mds_getxattr" }, + { MDS_SETXATTR, "mds_setxattr" }, + { MDS_WRITEPAGE, "mds_writepage" }, + { MDS_IS_SUBDIR, "mds_is_subdir" }, + { MDS_GET_INFO, "mds_get_info" }, + { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, + { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, + { MDS_HSM_ACTION, "mds_hsm_action" }, + { MDS_HSM_PROGRESS, "mds_hsm_progress" }, + { MDS_HSM_REQUEST, "mds_hsm_request" }, + { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, + { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, + { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, + { LDLM_SET_INFO, "ldlm_set_info" }, + { MGS_CONNECT, "mgs_connect" }, + { MGS_DISCONNECT, "mgs_disconnect" }, + { MGS_EXCEPTION, "mgs_exception" }, + { MGS_TARGET_REG, "mgs_target_reg" }, + { MGS_TARGET_DEL, "mgs_target_del" }, + { MGS_SET_INFO, "mgs_set_info" }, + { MGS_CONFIG_READ, "mgs_config_read" }, + { OBD_PING, "obd_ping" }, + { OBD_LOG_CANCEL, "llog_origin_handle_cancel" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, + { OBD_IDX_READ, "dt_index_read" }, + { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_create" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, + { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, + { LLOG_CATINFO, "llog_catinfo" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, + { SEQ_QUERY, "seq_query" }, + { SEC_CTX_INIT, "sec_ctx_init" }, + { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" }, + { SEC_CTX_FINI, "sec_ctx_fini" }, + { FLD_QUERY, "fld_query" }, + { UPDATE_OBJ, "update_obj" }, +}; + +struct ll_eopcode { + __u32 opcode; + const char *opname; +} ll_eopcode_table[EXTRA_LAST_OPC] = { + { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, + { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, + { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, + { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, + { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, + { MDS_REINT_CREATE, "mds_reint_create" }, + { MDS_REINT_LINK, "mds_reint_link" }, + { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, + { BRW_READ_BYTES, "read_bytes" }, + { BRW_WRITE_BYTES, "write_bytes" }, +}; + +const char *ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internal.h needs to be modified. + */ + __u32 offset = opcode_offset(opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); + return ll_rpc_opcode_table[offset].opname; +} + +const char* ll_eopcode2str(__u32 opcode) +{ + LASSERT(ll_eopcode_table[opcode].opcode == opcode); + return ll_eopcode_table[opcode].opname; +} +#ifdef LPROCFS +void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, + char *name, struct proc_dir_entry **procroot_ret, + struct lprocfs_stats **stats_ret) +{ + struct proc_dir_entry *svc_procroot; + struct lprocfs_stats *svc_stats; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; + + LASSERT(*procroot_ret == NULL); + LASSERT(*stats_ret == NULL); + + svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES+LUSTRE_MAX_OPCODES,0); + if (svc_stats == NULL) + return; + + if (dir) { + svc_procroot = lprocfs_register(dir, root, NULL, NULL); + if (IS_ERR(svc_procroot)) { + lprocfs_free_stats(&svc_stats); + return; + } + } else { + svc_procroot = root; + } + + lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, + svc_counter_config, "req_waittime", "usec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, + svc_counter_config, "req_qdepth", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, + svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "req_timeout", "sec"); + lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + svc_counter_config, "reqbuf_avail", "bufs"); + for (i = 0; i < EXTRA_LAST_OPC; i++) { + char *units; + + switch(i) { + case BRW_WRITE_BYTES: + case BRW_READ_BYTES: + units = "bytes"; + break; + default: + units = "reqs"; + break; + } + lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, + svc_counter_config, + ll_eopcode2str(i), units); + } + for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + lprocfs_counter_init(svc_stats, + EXTRA_MAX_OPCODES + i, svc_counter_config, + ll_opcode2str(opcode), "usec"); + } + + rc = lprocfs_register_stats(svc_procroot, name, svc_stats); + if (rc < 0) { + if (dir) + lprocfs_remove(&svc_procroot); + lprocfs_free_stats(&svc_stats); + } else { + if (dir) + *procroot_ret = svc_procroot; + *stats_ret = svc_stats; + } +} + +static int +ptlrpc_lprocfs_read_req_history_len(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + *eof = 1; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_hist_nrqbds; + + return snprintf(page, count, "%d\n", total); +} + +static int +ptlrpc_lprocfs_read_req_history_max(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + *eof = 1; + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svc->srv_hist_nrqbds_cpt_max; + + return snprintf(page, count, "%d\n", total); +} + +static int +ptlrpc_lprocfs_write_req_history_max(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int bufpages; + int val; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + /* This sanity check is more of an insanity check; we can still + * hose a kernel by allowing the request history to grow too + * far. */ + bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (val > num_physpages/(2 * bufpages)) + return -ERANGE; + + spin_lock(&svc->srv_lock); + + if (val == 0) + svc->srv_hist_nrqbds_cpt_max = 0; + else + svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts)); + + spin_unlock(&svc->srv_lock); + + return count; +} + +static int +ptlrpc_lprocfs_rd_threads_min(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + + return snprintf(page, count, "%d\n", + svc->srv_nthrs_cpt_init * svc->srv_ncpts); +} + +static int +ptlrpc_lprocfs_wr_threads_min(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_init = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} + +static int +ptlrpc_lprocfs_rd_threads_started(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + struct ptlrpc_service_part *svcpt; + int total = 0; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) + total += svcpt->scp_nthrs_running; + + return snprintf(page, count, "%d\n", total); +} + +static int +ptlrpc_lprocfs_rd_threads_max(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + + return snprintf(page, count, "%d\n", + svc->srv_nthrs_cpt_limit * svc->srv_ncpts); +} + +static int +ptlrpc_lprocfs_wr_threads_max(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int val; + int rc = lprocfs_write_helper(buffer, count, &val); + + if (rc < 0) + return rc; + + if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) + return -ERANGE; + + spin_lock(&svc->srv_lock); + if (val < svc->srv_nthrs_cpt_init * svc->srv_ncpts) { + spin_unlock(&svc->srv_lock); + return -ERANGE; + } + + svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts; + + spin_unlock(&svc->srv_lock); + + return count; +} + +/** + * \addtogoup nrs + * @{ + */ +extern struct nrs_core nrs_core; + +/** + * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. + * + * \param[in] state The policy state + */ +static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) +{ + switch (state) { + default: + LBUG(); + case NRS_POL_STATE_INVALID: + return "invalid"; + case NRS_POL_STATE_STOPPED: + return "stopped"; + case NRS_POL_STATE_STOPPING: + return "stopping"; + case NRS_POL_STATE_STARTING: + return "starting"; + case NRS_POL_STATE_STARTED: + return "started"; + } +} + +/** + * Obtains status information for \a policy. + * + * Information is copied in \a info. + * + * \param[in] policy The policy + * \param[out] info Holds returned status information + */ +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info) +{ + LASSERT(policy != NULL); + LASSERT(info != NULL); + LASSERT(spin_is_locked(&policy->pol_nrs->nrs_lock)); + + memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); + + info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); + info->pi_state = policy->pol_state; + /** + * XXX: These are accessed without holding + * ptlrpc_service_part::scp_req_lock. + */ + info->pi_req_queued = policy->pol_req_queued; + info->pi_req_started = policy->pol_req_started; +} + +/** + * Reads and prints policy status information for all policies of a PTLRPC + * service. + */ +static int ptlrpc_lprocfs_rd_nrs(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_pol_info *infos; + struct ptlrpc_nrs_pol_info tmp; + unsigned num_pols; + unsigned pol_idx = 0; + bool hp = false; + int i; + int rc = 0; + int rc2 = 0; + ENTRY; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Use the first service partition's regular NRS head in order to obtain + * the number of policies registered with NRS heads of this service. All + * service partitions will have the same number of policies. + */ + nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); + + spin_lock(&nrs->nrs_lock); + num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; + spin_unlock(&nrs->nrs_lock); + + OBD_ALLOC(infos, num_pols * sizeof(*infos)); + if (infos == NULL) + GOTO(out, rc = -ENOMEM); +again: + + ptlrpc_service_for_each_part(svcpt, i, svc) { + nrs = nrs_svcpt2nrs(svcpt, hp); + spin_lock(&nrs->nrs_lock); + + pol_idx = 0; + + list_for_each_entry(policy, &nrs->nrs_policy_list, + pol_list) { + LASSERT(pol_idx < num_pols); + + nrs_policy_get_info_locked(policy, &tmp); + /** + * Copy values when handling the first service + * partition. + */ + if (i == 0) { + memcpy(infos[pol_idx].pi_name, tmp.pi_name, + NRS_POL_NAME_MAX); + memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, + sizeof(tmp.pi_state)); + infos[pol_idx].pi_fallback = tmp.pi_fallback; + /** + * For the rest of the service partitions + * sanity-check the values we get. + */ + } else { + LASSERT(strncmp(infos[pol_idx].pi_name, + tmp.pi_name, + NRS_POL_NAME_MAX) == 0); + /** + * Not asserting ptlrpc_nrs_pol_info::pi_state, + * because it may be different between + * instances of the same policy in different + * service partitions. + */ + LASSERT(infos[pol_idx].pi_fallback == + tmp.pi_fallback); + } + + infos[pol_idx].pi_req_queued += tmp.pi_req_queued; + infos[pol_idx].pi_req_started += tmp.pi_req_started; + + pol_idx++; + } + spin_unlock(&nrs->nrs_lock); + } + + /** + * Policy status information output is in YAML format. + * For example: + * + * regular_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 0 + * + * - name: crrn + * state: started + * fallback: no + * queued: 2015 + * active: 384 + * + * high_priority_requests: + * - name: fifo + * state: started + * fallback: yes + * queued: 0 + * active: 2 + * + * - name: crrn + * state: stopped + * fallback: no + * queued: 0 + * active: 0 + */ + rc2 = snprintf(page + rc, count - rc, + "%s\n", !hp ? + "\nregular_requests:" : + "high_priority_requests:"); + + if (rc2 >= count - rc) { + /** Output was truncated */ + GOTO(out, rc = -EFBIG); + } + + rc += rc2; + + for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { + rc2 = snprintf(page + rc, count - rc, + " - name: %s\n" + " state: %s\n" + " fallback: %s\n" + " queued: %-20d\n" + " active: %-20d\n\n", + infos[pol_idx].pi_name, + nrs_state2str(infos[pol_idx].pi_state), + infos[pol_idx].pi_fallback ? "yes" : "no", + (int)infos[pol_idx].pi_req_queued, + (int)infos[pol_idx].pi_req_started); + + + if (rc2 >= count - rc) { + /** Output was truncated */ + GOTO(out, rc = -EFBIG); + } + + rc += rc2; + } + + if (!hp && nrs_svc_has_hp(svc)) { + memset(infos, 0, num_pols * sizeof(*infos)); + + /** + * Redo the processing for the service's HP NRS heads' policies. + */ + hp = true; + goto again; + } + + *eof = 1; + +out: + if (infos) + OBD_FREE(infos, num_pols * sizeof(*infos)); + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +/** + * The longest valid command string is the maxium policy name size, plus the + * length of the " reg" substring + */ +#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1) + +/** + * Starts and stops a given policy on a PTLRPC service. + * + * Commands consist of the policy name, followed by an optional [reg|hp] token; + * if the optional token is omitted, the operation is performed on both the + * regular and high-priority (if the service has one) NRS head. + */ +static int ptlrpc_lprocfs_wr_nrs(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; + char *cmd; + char *cmd_copy = NULL; + char *token; + int rc = 0; + ENTRY; + + if (count >= LPROCFS_NRS_WR_MAX_CMD) + GOTO(out, rc = -EINVAL); + + OBD_ALLOC(cmd, LPROCFS_NRS_WR_MAX_CMD); + if (cmd == NULL) + GOTO(out, rc = -ENOMEM); + /** + * strsep() modifies its argument, so keep a copy + */ + cmd_copy = cmd; + + if (copy_from_user(cmd, buffer, count)) + GOTO(out, rc = -EFAULT); + + cmd[count] = '\0'; + + token = strsep(&cmd, " "); + + if (strlen(token) > NRS_POL_NAME_MAX - 1) + GOTO(out, rc = -EINVAL); + + /** + * No [reg|hp] token has been specified + */ + if (cmd == NULL) + goto default_queue; + + /** + * The second token is either NULL, or an optional [reg|hp] string + */ + if (strcmp(cmd, "reg") == 0) + queue = PTLRPC_NRS_QUEUE_REG; + else if (strcmp(cmd, "hp") == 0) + queue = PTLRPC_NRS_QUEUE_HP; + else + GOTO(out, rc = -EINVAL); + +default_queue: + + if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) + GOTO(out, rc = -ENODEV); + else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) + queue = PTLRPC_NRS_QUEUE_REG; + + /** + * Serialize NRS core lprocfs operations with policy registration/ + * unregistration. + */ + mutex_lock(&nrs_core.nrs_mutex); + + rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START, + false, NULL); + + mutex_unlock(&nrs_core.nrs_mutex); +out: + if (cmd_copy) + OBD_FREE(cmd_copy, LPROCFS_NRS_WR_MAX_CMD); + + RETURN(rc < 0 ? rc : count); +} + +/** @} nrs */ + +struct ptlrpc_srh_iterator { + int srhi_idx; + __u64 srhi_seq; + struct ptlrpc_request *srhi_req; +}; + +int +ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, + struct ptlrpc_srh_iterator *srhi, + __u64 seq) +{ + struct list_head *e; + struct ptlrpc_request *req; + + if (srhi->srhi_req != NULL && + srhi->srhi_seq > svcpt->scp_hist_seq_culled && + srhi->srhi_seq <= seq) { + /* If srhi_req was set previously, hasn't been culled and + * we're searching for a seq on or after it (i.e. more + * recent), search from it onwards. + * Since the service history is LRU (i.e. culled reqs will + * be near the head), we shouldn't have to do long + * re-scans */ + LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, + "%s:%d: seek seq "LPU64", request seq "LPU64"\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + srhi->srhi_seq, srhi->srhi_req->rq_history_seq); + LASSERTF(!list_empty(&svcpt->scp_hist_reqs), + "%s:%d: seek offset "LPU64", request seq "LPU64", " + "last culled "LPU64"\n", + svcpt->scp_service->srv_name, svcpt->scp_cpt, + seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); + e = &srhi->srhi_req->rq_history_list; + } else { + /* search from start */ + e = svcpt->scp_hist_reqs.next; + } + + while (e != &svcpt->scp_hist_reqs) { + req = list_entry(e, struct ptlrpc_request, rq_history_list); + + if (req->rq_history_seq >= seq) { + srhi->srhi_seq = req->rq_history_seq; + srhi->srhi_req = req; + return 0; + } + e = e->next; + } + + return -ENOENT; +} + +/* + * ptlrpc history sequence is used as "position" of seq_file, in some case, + * seq_read() will increase "position" to indicate reading the next + * element, however, low bits of history sequence are reserved for CPT id + * (check the details from comments before ptlrpc_req_add_history), which + * means seq_read() might change CPT id of history sequence and never + * finish reading of requests on a CPT. To make it work, we have to shift + * CPT id to high bits and timestamp to low bits, so seq_read() will only + * increase timestamp which can correctly indicate the next position. + */ + +/* convert seq_file pos to cpt */ +#define PTLRPC_REQ_POS2CPT(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) + +/* make up seq_file pos from cpt */ +#define PTLRPC_REQ_CPT2POS(svc, cpt) \ + ((svc)->srv_cpt_bits == 0 ? 0 : \ + (cpt) << (64 - (svc)->srv_cpt_bits)) + +/* convert sequence to position */ +#define PTLRPC_REQ_SEQ2POS(svc, seq) \ + ((svc)->srv_cpt_bits == 0 ? (seq) : \ + ((seq) >> (svc)->srv_cpt_bits) | \ + ((seq) << (64 - (svc)->srv_cpt_bits))) + +/* convert position to sequence */ +#define PTLRPC_REQ_POS2SEQ(svc, pos) \ + ((svc)->srv_cpt_bits == 0 ? (pos) : \ + ((__u64)(pos) << (svc)->srv_cpt_bits) | \ + ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) + +static void * +ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_srh_iterator *srhi; + unsigned int cpt; + int rc; + int i; + + if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ + CWARN("Failed to read request history because size of loff_t " + "%d can't match size of u64\n", (int)sizeof(loff_t)); + return NULL; + } + + OBD_ALLOC(srhi, sizeof(*srhi)); + if (srhi == NULL) + return NULL; + + srhi->srhi_seq = 0; + srhi->srhi_req = NULL; + + cpt = PTLRPC_REQ_POS2CPT(svc, *pos); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (i < cpt) /* skip */ + continue; + if (i > cpt) /* make up the lowest position for this CPT */ + *pos = PTLRPC_REQ_CPT2POS(svc, i); + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, + PTLRPC_REQ_POS2SEQ(svc, *pos)); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +static void +ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) +{ + struct ptlrpc_srh_iterator *srhi = iter; + + if (srhi != NULL) + OBD_FREE(srhi, sizeof(*srhi)); +} + +static void * +ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, + void *iter, loff_t *pos) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + __u64 seq; + int rc; + int i; + + for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { + svcpt = svc->srv_parts[i]; + + if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ + srhi->srhi_req = NULL; + seq = srhi->srhi_seq = 0; + } else { /* the next sequence */ + seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); + } + + spin_lock(&svcpt->scp_lock); + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); + spin_unlock(&svcpt->scp_lock); + if (rc == 0) { + *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); + srhi->srhi_idx = i; + return srhi; + } + } + + OBD_FREE(srhi, sizeof(*srhi)); + return NULL; +} + +/* common ost/mdt so_req_printer */ +void target_print_req(void *seq_file, struct ptlrpc_request *req) +{ + /* Called holding srv_lock with irqs disabled. + * Print specific req contents and a newline. + * CAVEAT EMPTOR: check request message length before printing!!! + * You might have received any old crap so you must be just as + * careful here as the service's request parser!!! */ + struct seq_file *sf = seq_file; + + switch (req->rq_phase) { + case RQ_PHASE_NEW: + /* still awaiting a service thread's attention, or rejected + * because the generic request message didn't unpack */ + seq_printf(sf, "\n"); + break; + case RQ_PHASE_INTERPRET: + /* being handled, so basic msg swabbed, and opc is valid + * but racing with mds_handle() */ + case RQ_PHASE_COMPLETE: + /* been handled by mds_handle() reply state possibly still + * volatile */ + seq_printf(sf, "opc %d\n", lustre_msg_get_opc(req->rq_reqmsg)); + break; + default: + DEBUG_REQ(D_ERROR, req, "bad phase %d", req->rq_phase); + } +} +EXPORT_SYMBOL(target_print_req); + +static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) +{ + struct ptlrpc_service *svc = s->private; + struct ptlrpc_srh_iterator *srhi = iter; + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request *req; + int rc; + + LASSERT(srhi->srhi_idx < svc->srv_ncpts); + + svcpt = svc->srv_parts[srhi->srhi_idx]; + + spin_lock(&svcpt->scp_lock); + + rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); + + if (rc == 0) { + req = srhi->srhi_req; + + /* Print common req fields. + * CAVEAT EMPTOR: we're racing with the service handler + * here. The request could contain any old crap, so you + * must be just as careful as the service's request + * parser. Currently I only print stuff here I know is OK + * to look at coz it was set up in request_in_callback()!!! */ + seq_printf(s, LPD64":%s:%s:x"LPU64":%d:%s:%ld:%lds(%+lds) ", + req->rq_history_seq, libcfs_nid2str(req->rq_self), + libcfs_id2str(req->rq_peer), req->rq_xid, + req->rq_reqlen, ptlrpc_rqphase2str(req), + req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_arrival_time.tv_sec, + req->rq_sent - req->rq_deadline); + if (svc->srv_ops.so_req_printer == NULL) + seq_printf(s, "\n"); + else + svc->srv_ops.so_req_printer(s, srhi->srhi_req); + } + + spin_unlock(&svcpt->scp_lock); + return rc; +} + +static int +ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) +{ + static struct seq_operations sops = { + .start = ptlrpc_lprocfs_svc_req_history_start, + .stop = ptlrpc_lprocfs_svc_req_history_stop, + .next = ptlrpc_lprocfs_svc_req_history_next, + .show = ptlrpc_lprocfs_svc_req_history_show, + }; + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *seqf; + int rc; + + LPROCFS_ENTRY_AND_CHECK(dp); + rc = seq_open(file, &sops); + if (rc) { + LPROCFS_EXIT(); + return rc; + } + + seqf = file->private_data; + seqf->private = dp->data; + return 0; +} + +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + struct ptlrpc_service_part *svcpt; + struct dhms ts; + time_t worstt; + unsigned int cur; + unsigned int worst; + int nob = 0; + int rc = 0; + int i; + + if (AT_OFF) { + rc += snprintf(page + rc, count - rc, + "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + return rc; + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + cur = at_get(&svcpt->scp_at_estimate); + worst = svcpt->scp_at_estimate.at_worst_ever; + worstt = svcpt->scp_at_estimate.at_worst_time; + s2dhms(&ts, cfs_time_current_sec() - worstt); + + nob = snprintf(page, count, + "%10s : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", "service", + cur, worst, worstt, DHMS_VARS(&ts)); + + nob = lprocfs_at_hist_helper(page, count, nob, + &svcpt->scp_at_estimate); + rc += nob; + page += nob; + count -= nob; + + /* + * NB: for lustre proc read, the read count must be less + * than PAGE_SIZE, please see details in lprocfs_fops_read. + * It's unlikely that we exceed PAGE_SIZE at here because + * it means the service has more than 50 partitions. + */ + if (count <= 0) { + CWARN("Can't fit AT information of %s in one page, " + "please contact with developer to fix this.\n", + svc->srv_name); + break; + } + } + + return rc; +} + +static int ptlrpc_lprocfs_rd_hp_ratio(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + int rc = snprintf(page, count, "%d", svc->srv_hpreq_ratio); + return rc; +} + +static int ptlrpc_lprocfs_wr_hp_ratio(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int rc; + int val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val < 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + + return count; +} + +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, + struct ptlrpc_service *svc) +{ + struct lprocfs_vars lproc_vars[] = { + {.name = "high_priority_ratio", + .read_fptr = ptlrpc_lprocfs_rd_hp_ratio, + .write_fptr = ptlrpc_lprocfs_wr_hp_ratio, + .data = svc}, + {.name = "req_buffer_history_len", + .read_fptr = ptlrpc_lprocfs_read_req_history_len, + .data = svc}, + {.name = "req_buffer_history_max", + .write_fptr = ptlrpc_lprocfs_write_req_history_max, + .read_fptr = ptlrpc_lprocfs_read_req_history_max, + .data = svc}, + {.name = "threads_min", + .read_fptr = ptlrpc_lprocfs_rd_threads_min, + .write_fptr = ptlrpc_lprocfs_wr_threads_min, + .data = svc}, + {.name = "threads_max", + .read_fptr = ptlrpc_lprocfs_rd_threads_max, + .write_fptr = ptlrpc_lprocfs_wr_threads_max, + .data = svc}, + {.name = "threads_started", + .read_fptr = ptlrpc_lprocfs_rd_threads_started, + .data = svc}, + {.name = "timeouts", + .read_fptr = ptlrpc_lprocfs_rd_timeouts, + .data = svc}, + {.name = "nrs_policies", + .read_fptr = ptlrpc_lprocfs_rd_nrs, + .write_fptr = ptlrpc_lprocfs_wr_nrs, + .data = svc}, + {NULL} + }; + static struct file_operations req_history_fops = { + .owner = THIS_MODULE, + .open = ptlrpc_lprocfs_svc_req_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, + }; + + int rc; + + ptlrpc_lprocfs_register(entry, svc->srv_name, + "stats", &svc->srv_procroot, + &svc->srv_stats); + + if (svc->srv_procroot == NULL) + return; + + lprocfs_add_vars(svc->srv_procroot, lproc_vars, NULL); + + rc = lprocfs_seq_create(svc->srv_procroot, "req_history", + 0400, &req_history_fops, svc); + if (rc) + CWARN("Error adding the req_history file\n"); +} + +void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) +{ + ptlrpc_lprocfs_register(obddev->obd_proc_entry, NULL, "stats", + &obddev->obd_svc_procroot, + &obddev->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); + +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) +{ + struct lprocfs_stats *svc_stats; + __u32 op = lustre_msg_get_opc(req->rq_reqmsg); + int opc = opcode_offset(op); + + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (svc_stats == NULL || opc <= 0) + return; + LASSERT(opc < LUSTRE_MAX_OPCODES); + if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) + lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); +} + +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) +{ + struct lprocfs_stats *svc_stats; + int idx; + + if (!req->rq_import) + return; + svc_stats = req->rq_import->imp_obd->obd_svc_stats; + if (!svc_stats) + return; + idx = lustre_msg_get_opc(req->rq_reqmsg); + switch (idx) { + case OST_READ: + idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; + break; + case OST_WRITE: + idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; + break; + default: + LASSERTF(0, "unsupported opcode %u\n", idx); + break; + } + + lprocfs_counter_add(svc_stats, idx, bytes); +} + +EXPORT_SYMBOL(ptlrpc_lprocfs_brw); + +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (svc->srv_procroot != NULL) + lprocfs_remove(&svc->srv_procroot); + + if (svc->srv_stats) + lprocfs_free_stats(&svc->srv_stats); +} + +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) +{ + if (obd->obd_svc_procroot) + lprocfs_remove(&obd->obd_svc_procroot); + + if (obd->obd_svc_stats) + lprocfs_free_stats(&obd->obd_svc_stats); +} +EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); + + +#define BUFLEN (UUID_MAX + 5) + +int lprocfs_wr_evict_client(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + char *kbuf; + char *tmpbuf; + + OBD_ALLOC(kbuf, BUFLEN); + if (kbuf == NULL) + return -ENOMEM; + + /* + * OBD_ALLOC() will zero kbuf, but we only copy BUFLEN - 1 + * bytes into kbuf, to ensure that the string is NUL-terminated. + * UUID_MAX should include a trailing NUL already. + */ + if (copy_from_user(kbuf, buffer, + min_t(unsigned long, BUFLEN - 1, count))) { + count = -EFAULT; + goto out; + } + tmpbuf = cfs_firststr(kbuf, min_t(unsigned long, BUFLEN - 1, count)); + /* Kludge code(deadlock situation): the lprocfs lock has been held + * since the client is evicted by writting client's + * uuid/nid to procfs "evict_client" entry. However, + * obd_export_evict_by_uuid() will call lprocfs_remove() to destroy + * the proc entries under the being destroyed export{}, so I have + * to drop the lock at first here. + * - jay, jxiong@clusterfs.com */ + LPROCFS_EXIT(); + class_incref(obd, __FUNCTION__, current); + + if (strncmp(tmpbuf, "nid:", 4) == 0) + obd_export_evict_by_nid(obd, tmpbuf + 4); + else if (strncmp(tmpbuf, "uuid:", 5) == 0) + obd_export_evict_by_uuid(obd, tmpbuf + 5); + else + obd_export_evict_by_uuid(obd, tmpbuf); + + class_decref(obd, __FUNCTION__, current); + LPROCFS_ENTRY(); + +out: + OBD_FREE(kbuf, BUFLEN); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_evict_client); + +#undef BUFLEN + +int lprocfs_wr_ping(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_request *req; + int rc; + ENTRY; + + LPROCFS_CLIMP_CHECK(obd); + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + LPROCFS_CLIMP_EXIT(obd); + if (req == NULL) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + if (rc >= 0) + RETURN(count); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_wr_ping); + +/* Write the connection UUID to this file to attempt to connect to that node. + * The connection UUID is a node's primary NID. For example, + * "echo connection=192.168.0.1@tcp0::instance > .../import". + */ +int lprocfs_wr_import(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp = obd->u.cli.cl_import; + char *kbuf = NULL; + char *uuid; + char *ptr; + int do_reconn = 1; + const char prefix[] = "connection="; + const int prefix_len = sizeof(prefix) - 1; + + if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len) + return -EINVAL; + + OBD_ALLOC(kbuf, count + 1); + if (kbuf == NULL) + return -ENOMEM; + + if (copy_from_user(kbuf, buffer, count)) + GOTO(out, count = -EFAULT); + + kbuf[count] = 0; + + /* only support connection=uuid::instance now */ + if (strncmp(prefix, kbuf, prefix_len) != 0) + GOTO(out, count = -EINVAL); + + uuid = kbuf + prefix_len; + ptr = strstr(uuid, "::"); + if (ptr) { + __u32 inst; + char *endptr; + + *ptr = 0; + do_reconn = 0; + ptr += strlen("::"); + inst = simple_strtol(ptr, &endptr, 10); + if (*endptr) { + CERROR("config: wrong instance # %s\n", ptr); + } else if (inst != imp->imp_connect_data.ocd_instance) { + CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted " + "target(%u/%u), reconnecting...\n", + imp->imp_obd->obd_name, + imp->imp_connect_data.ocd_instance, inst); + do_reconn = 1; + } else { + CDEBUG(D_INFO, "IR: %s has already been connecting to " + "new target(%u)\n", + imp->imp_obd->obd_name, inst); + } + } + + if (do_reconn) + ptlrpc_recover_import(imp, uuid, 1); + +out: + OBD_FREE(kbuf, count + 1); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_import); + +int lprocfs_rd_pinger_recov(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp = obd->u.cli.cl_import; + int rc; + + LPROCFS_CLIMP_CHECK(obd); + rc = snprintf(page, count, "%d\n", !imp->imp_no_pinger_recover); + LPROCFS_CLIMP_EXIT(obd); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_pinger_recov); + +int lprocfs_wr_pinger_recov(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val != 0 && val != 1) + return -ERANGE; + + LPROCFS_CLIMP_CHECK(obd); + spin_lock(&imp->imp_lock); + imp->imp_no_pinger_recover = !val; + spin_unlock(&imp->imp_lock); + LPROCFS_CLIMP_EXIT(obd); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_pinger_recov); + +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c new file mode 100644 index 000000000000..de3f0db0ba47 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c @@ -0,0 +1,728 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * Helper function. Sends \a len bytes from \a base at offset \a offset + * over \a conn connection to portal \a portal. + * Returns 0 on success or error code. + */ +static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, + lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, + struct ptlrpc_connection *conn, int portal, __u64 xid, + unsigned int offset) +{ + int rc; + lnet_md_t md; + ENTRY; + + LASSERT (portal != 0); + LASSERT (conn != NULL); + CDEBUG (D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); + md.start = base; + md.length = len; + md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; + md.options = PTLRPC_MD_OPTIONS; + md.user_ptr = cbid; + md.eq_handle = ptlrpc_eq_h; + + if (unlikely(ack == LNET_ACK_REQ && + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, OBD_FAIL_ONCE))){ + /* don't ask for the ack to simulate failing client */ + ack = LNET_NOACK_REQ; + } + + rc = LNetMDBind (md, LNET_UNLINK, mdh); + if (unlikely(rc != 0)) { + CERROR ("LNetMDBind failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + RETURN (-ENOMEM); + } + + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n", + len, portal, xid, offset); + + rc = LNetPut (conn->c_self, *mdh, ack, + conn->c_peer, portal, xid, offset, 0); + if (unlikely(rc != 0)) { + int rc2; + /* We're going to get an UNLINK event when I unlink below, + * which will complete just like any other failed send, so + * I fall through and return success here! */ + CERROR("LNetPut(%s, %d, "LPD64") failed: %d\n", + libcfs_id2str(conn->c_peer), portal, xid, rc); + rc2 = LNetMDUnlink(*mdh); + LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); + } + + RETURN (0); +} + +static void mdunlink_iterate_helper(lnet_handle_md_t *bd_mds, int count) +{ + int i; + + for (i = 0; i < count; i++) + LNetMDUnlink(bd_mds[i]); +} + + +/** + * Register bulk at the sender for later transfer. + * Returns 0 on success or error code. + */ +int ptlrpc_register_bulk(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + lnet_process_id_t peer; + int rc = 0; + int rc2; + int posted_md; + int total_md; + __u64 xid; + lnet_handle_me_t me_h; + lnet_md_t md; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) + RETURN(0); + + /* NB no locking required until desc is on the network */ + LASSERT(desc->bd_nob > 0); + LASSERT(desc->bd_md_count == 0); + LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(desc->bd_req != NULL); + LASSERT(desc->bd_type == BULK_PUT_SINK || + desc->bd_type == BULK_GET_SOURCE); + + /* cleanup the state of the bulk for it will be reused */ + if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) + desc->bd_nob_transferred = 0; + else + LASSERT(desc->bd_nob_transferred == 0); + + desc->bd_failure = 0; + + peer = desc->bd_import->imp_connection->c_peer; + + LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); + LASSERT(desc->bd_cbid.cbid_arg == desc); + + /* An XID is only used for a single request from the client. + * For retried bulk transfers, a new XID will be allocated in + * in ptlrpc_check_set() if it needs to be resent, so it is not + * using the same RDMA match bits after an error. + * + * For multi-bulk RPCs, rq_xid is the last XID needed for bulks. The + * first bulk XID is power-of-two aligned before rq_xid. LU-1431 */ + xid = req->rq_xid & ~((__u64)desc->bd_md_max_brw - 1); + LASSERTF(!(desc->bd_registered && + req->rq_send_state != LUSTRE_IMP_REPLAY) || + xid != desc->bd_last_xid, + "registered: %d rq_xid: "LPU64" bd_last_xid: "LPU64"\n", + desc->bd_registered, xid, desc->bd_last_xid); + + total_md = (desc->bd_iov_count + LNET_MAX_IOV - 1) / LNET_MAX_IOV; + desc->bd_registered = 1; + desc->bd_last_xid = xid; + desc->bd_md_count = total_md; + md.user_ptr = &desc->bd_cbid; + md.eq_handle = ptlrpc_eq_h; + md.threshold = 1; /* PUT or GET */ + + for (posted_md = 0; posted_md < total_md; posted_md++, xid++) { + md.options = PTLRPC_MD_OPTIONS | + ((desc->bd_type == BULK_GET_SOURCE) ? + LNET_MD_OP_GET : LNET_MD_OP_PUT); + ptlrpc_fill_bulk_md(&md, desc, posted_md); + + rc = LNetMEAttach(desc->bd_portal, peer, xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("%s: LNetMEAttach failed x"LPU64"/%d: rc = %d\n", + desc->bd_export->exp_obd->obd_name, xid, + posted_md, rc); + break; + } + + /* About to let the network at it... */ + rc = LNetMDAttach(me_h, md, LNET_UNLINK, + &desc->bd_mds[posted_md]); + if (rc != 0) { + CERROR("%s: LNetMDAttach failed x"LPU64"/%d: rc = %d\n", + desc->bd_export->exp_obd->obd_name, xid, + posted_md, rc); + rc2 = LNetMEUnlink(me_h); + LASSERT(rc2 == 0); + break; + } + } + + if (rc != 0) { + LASSERT(rc == -ENOMEM); + spin_lock(&desc->bd_lock); + desc->bd_md_count -= total_md - posted_md; + spin_unlock(&desc->bd_lock); + LASSERT(desc->bd_md_count >= 0); + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + req->rq_status = -ENOMEM; + RETURN(-ENOMEM); + } + + /* Set rq_xid to matchbits of the final bulk so that server can + * infer the number of bulks that were prepared */ + req->rq_xid = --xid; + LASSERTF(desc->bd_last_xid == (req->rq_xid & PTLRPC_BULK_OPS_MASK), + "bd_last_xid = x"LPU64", rq_xid = x"LPU64"\n", + desc->bd_last_xid, req->rq_xid); + + spin_lock(&desc->bd_lock); + /* Holler if peer manages to touch buffers before he knows the xid */ + if (desc->bd_md_count != total_md) + CWARN("%s: Peer %s touched %d buffers while I registered\n", + desc->bd_export->exp_obd->obd_name, libcfs_id2str(peer), + total_md - desc->bd_md_count); + spin_unlock(&desc->bd_lock); + + CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, " + "xid x"LPX64"-"LPX64", portal %u\n", desc->bd_md_count, + desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink", + desc->bd_iov_count, desc->bd_nob, + desc->bd_last_xid, req->rq_xid, desc->bd_portal); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_register_bulk); + +/** + * Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). + * Returns 1 on success or 0 if network unregistration failed for whatever + * reason. + */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) +{ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + wait_queue_head_t *wq; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ + + /* Let's setup deadline for reply unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0) + req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK; + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ + + /* the unlink ensures the callback happens ASAP and is the last + * one. If it fails, it must be because completion just happened, + * but we must still l_wait_event() in this case to give liblustre + * a chance to run client_bulk_callback() */ + mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); + + if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); + + /* Do not wait for unlink to finish. */ + if (async) + RETURN(0); + + if (req->rq_set != NULL) + wq = &req->rq_set->set_waitq; + else + wq = &req->rq_reply_waitq; + + for (;;) { + /* Network access will complete in finite time but the HUGE + * timeout lets us CWARN for visibility of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", + desc); + } + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_bulk); + +static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int service_time = max_t(int, cfs_time_current_sec() - + req->rq_arrival_time.tv_sec, 1); + + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && + !(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY | + MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { + /* early replies, errors and recovery requests don't count + * toward our service time estimate */ + int oldse = at_measured(&svcpt->scp_at_estimate, service_time); + + if (oldse != 0) { + DEBUG_REQ(D_ADAPTTO, req, + "svc %s changed estimate from %d to %d", + svc->srv_name, oldse, + at_get(&svcpt->scp_at_estimate)); + } + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's a error reply during recovery. + * (bz15815) */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering)) + lustre_msg_set_timeout(req->rq_repmsg, 0); + else + lustre_msg_set_timeout(req->rq_repmsg, + at_get(&svcpt->scp_at_estimate)); + + if (req->rq_reqmsg && + !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%d:%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_is_v1(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } +} + +/** + * Send request reply from request \a req reply buffer. + * \a flags defines reply types + * Returns 0 on sucess or error code + */ +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_connection *conn; + int rc; + + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). The reply generated by sptlrpc layer (e.g. + * error notify, etc.) might have NULL rq->reqmsg; Otherwise we must + * have a request buffer which is either the actual (swabbed) incoming + * request, or a saved copy if this is a req saved in + * target_queue_final_reply(). + */ + LASSERT (req->rq_no_reply == 0); + LASSERT (req->rq_reqbuf != NULL); + LASSERT (rs != NULL); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_repmsg == rs->rs_msg); + LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); + LASSERT (rs->rs_cb_id.cbid_arg == rs); + + /* There may be no rq_export during failover */ + + if (unlikely(req->rq_export && req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failed obd's only send ENODEV */ + req->rq_type = PTL_RPC_MSG_ERR; + req->rq_status = -ENODEV; + CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", + req->rq_export->exp_obd->obd_minor); + } + + /* In order to keep interoprability with the client (< 2.3) which + * doesn't have pb_jobid in ptlrpc_body, We have to shrink the + * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise, the + * reply buffer on client will be overflow. + * + * XXX Remove this whenver we drop the interoprability with such client. + */ + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, + sizeof(struct ptlrpc_body_v2), 1); + + if (req->rq_type != PTL_RPC_MSG_ERR) + req->rq_type = PTL_RPC_MSG_REPLY; + + lustre_msg_set_type(req->rq_repmsg, req->rq_type); + lustre_msg_set_status(req->rq_repmsg, req->rq_status); + lustre_msg_set_opc(req->rq_repmsg, + req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : 0); + + target_pack_pool_reply(req); + + ptlrpc_at_set_reply(req, flags); + + if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) + conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); + else + conn = ptlrpc_connection_addref(req->rq_export->exp_connection); + + if (unlikely(conn == NULL)) { + CERROR("not replying on NULL connection\n"); /* bug 9635 */ + return -ENOTCONN; + } + ptlrpc_rs_addref(rs); /* +1 ref for the network */ + + rc = sptlrpc_svc_wrap_reply(req); + if (unlikely(rc)) + goto out; + + req->rq_sent = cfs_time_current_sec(); + + rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, + &rs->rs_cb_id, conn, + ptlrpc_req2svc(req)->srv_rep_portal, + req->rq_xid, req->rq_reply_off); +out: + if (unlikely(rc != 0)) + ptlrpc_req_drop_rs(req); + ptlrpc_connection_put(conn); + return rc; +} +EXPORT_SYMBOL(ptlrpc_send_reply); + +int ptlrpc_reply (struct ptlrpc_request *req) +{ + if (req->rq_no_reply) + return 0; + else + return (ptlrpc_send_reply(req, 0)); +} +EXPORT_SYMBOL(ptlrpc_reply); + +/** + * For request \a req send an error reply back. Create empty + * reply buffers if necessary. + */ +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) +{ + int rc; + ENTRY; + + if (req->rq_no_reply) + RETURN(0); + + if (!req->rq_repmsg) { + rc = lustre_pack_reply(req, 1, NULL, NULL); + if (rc) + RETURN(rc); + } + + if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && + req->rq_status != -EPERM && req->rq_status != -ENOENT && + req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) + req->rq_type = PTL_RPC_MSG_ERR; + + rc = ptlrpc_send_reply(req, may_be_difficult); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_send_error); + +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} +EXPORT_SYMBOL(ptlrpc_error); + +/** + * Send request \a request. + * if \a noreply is set, don't expect any reply back and don't set up + * reply buffers. + * Returns 0 on success or error code. + */ +int ptl_send_rpc(struct ptlrpc_request *request, int noreply) +{ + int rc; + int rc2; + int mpflag = 0; + struct ptlrpc_connection *connection; + lnet_handle_me_t reply_me_h; + lnet_md_t reply_md; + struct obd_device *obd = request->rq_import->imp_obd; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) + RETURN(0); + + LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); + LASSERT(request->rq_wait_ctx == 0); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT(!request->rq_receiving_reply); + + if (request->rq_import->imp_obd && + request->rq_import->imp_obd->obd_fail) { + CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", + request->rq_import->imp_obd->obd_name); + /* this prevents us from waiting in ptlrpc_queue_wait */ + request->rq_err = 1; + request->rq_status = -ENODEV; + RETURN(-ENODEV); + } + + connection = request->rq_import->imp_connection; + + lustre_msg_set_handle(request->rq_reqmsg, + &request->rq_import->imp_remote_handle); + lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); + lustre_msg_set_conn_cnt(request->rq_reqmsg, + request->rq_import->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + request->rq_import->imp_msghdr_flags); + + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + + if (request->rq_memalloc) + mpflag = cfs_memory_pressure_get_and_set(); + + rc = sptlrpc_cli_wrap_request(request); + if (rc) + GOTO(out, rc); + + /* bulk register should be done after wrap_request() */ + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + GOTO(out, rc); + } + + if (!noreply) { + LASSERT (request->rq_replen != 0); + if (request->rq_repbuf == NULL) { + LASSERT(request->rq_repdata == NULL); + LASSERT(request->rq_repmsg == NULL); + rc = sptlrpc_cli_alloc_repbuf(request, + request->rq_replen); + if (rc) { + /* this prevents us from looping in + * ptlrpc_queue_wait */ + request->rq_err = 1; + request->rq_status = rc; + GOTO(cleanup_bulk, rc); + } + } else { + request->rq_repdata = NULL; + request->rq_repmsg = NULL; + } + + rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ + connection->c_peer, request->rq_xid, 0, + LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + GOTO(cleanup_bulk, rc = -ENOMEM); + } + } + + spin_lock(&request->rq_lock); + /* If the MD attach succeeds, there _will_ be a reply_in callback */ + request->rq_receiving_reply = !noreply; + /* We are responsible for unlinking the reply buffer */ + request->rq_must_unlink = !noreply; + /* Clear any flags that may be present from previous sends. */ + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_net_err = 0; + request->rq_resend = 0; + request->rq_restart = 0; + request->rq_reply_truncate = 0; + spin_unlock(&request->rq_lock); + + if (!noreply) { + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE | + LNET_MD_TRUNCATE; /* allow to make EOVERFLOW error */; + reply_md.user_ptr = &request->rq_reply_cbid; + reply_md.eq_handle = ptlrpc_eq_h; + + /* We must see the unlink callback to unset rq_must_unlink, + so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, + &request->rq_reply_md_h); + if (rc != 0) { + CERROR("LNetMDAttach failed: %d\n", rc); + LASSERT (rc == -ENOMEM); + spin_lock(&request->rq_lock); + /* ...but the MD attach didn't succeed... */ + request->rq_receiving_reply = 0; + spin_unlock(&request->rq_lock); + GOTO(cleanup_me, rc = -ENOMEM); + } + + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 + ", portal %u\n", + request->rq_repbuf_len, request->rq_xid, + request->rq_reply_portal); + } + + /* add references on request for request_out_callback */ + ptlrpc_request_addref(request); + if (obd->obd_svc_stats != NULL) + lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, + atomic_read(&request->rq_import->imp_inflight)); + + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); + + do_gettimeofday(&request->rq_arrival_time); + request->rq_sent = cfs_time_current_sec(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + + ptlrpc_pinger_sending_on_import(request->rq_import); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); + rc = ptl_send_buf(&request->rq_req_md_h, + request->rq_reqbuf, request->rq_reqdata_len, + LNET_NOACK_REQ, &request->rq_req_cbid, + connection, + request->rq_request_portal, + request->rq_xid, 0); + if (rc == 0) + GOTO(out, rc); + + ptlrpc_req_finished(request); + if (noreply) + GOTO(out, rc); + + cleanup_me: + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = LNetMEUnlink(reply_me_h); + LASSERT (rc2 == 0); + /* UNLINKED callback called synchronously */ + LASSERT(!request->rq_receiving_reply); + + cleanup_bulk: + /* We do sync unlink here as there was no real transfer here so + * the chance to have long unlink to sluggish net is smaller here. */ + ptlrpc_unregister_bulk(request, 0); + out: + if (request->rq_memalloc) + cfs_memory_pressure_restore(mpflag); + return rc; +} +EXPORT_SYMBOL(ptl_send_rpc); + +/** + * Register request buffer descriptor for request receiving. + */ +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; + static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; + int rc; + lnet_md_t md; + lnet_handle_me_t me_h; + + CDEBUG(D_NET, "LNetMEAttach: portal %d\n", + service->srv_req_portal); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) + return (-ENOMEM); + + /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, + * which means buffer can only be attached on local CPT, and LND + * threads can find it by grabbing a local lock */ + rc = LNetMEAttach(service->srv_req_portal, + match_id, 0, ~0, LNET_UNLINK, + rqbd->rqbd_svcpt->scp_cpt >= 0 ? + LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); + if (rc != 0) { + CERROR("LNetMEAttach failed: %d\n", rc); + return (-ENOMEM); + } + + LASSERT(rqbd->rqbd_refcount == 0); + rqbd->rqbd_refcount = 1; + + md.start = rqbd->rqbd_buffer; + md.length = service->srv_buf_size; + md.max_size = service->srv_max_req_size; + md.threshold = LNET_MD_THRESH_INF; + md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; + md.user_ptr = &rqbd->rqbd_cbid; + md.eq_handle = ptlrpc_eq_h; + + rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); + if (rc == 0) + return (0); + + CERROR("LNetMDAttach failed: %d; \n", rc); + LASSERT (rc == -ENOMEM); + rc = LNetMEUnlink (me_h); + LASSERT (rc == 0); + rqbd->rqbd_refcount = 0; + + return (-ENOMEM); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c new file mode 100644 index 000000000000..1996431e35ff --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/nrs.c @@ -0,0 +1,1790 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs.c + * + * Network Request Scheduler (NRS) + * + * Allows to reorder the handling of RPCs at servers. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/* XXX: This is just for liblustre. Remove the #if defined directive when the + * "cfs_" prefix is dropped from cfs_list_head. */ +extern struct list_head ptlrpc_all_services; + +/** + * NRS core object. + */ +struct nrs_core nrs_core; + +static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_desc->pd_ops->op_policy_init != NULL ? + policy->pol_desc->pd_ops->op_policy_init(policy) : 0; +} + +static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref == 0); + LASSERT(policy->pol_req_queued == 0); + + if (policy->pol_desc->pd_ops->op_policy_fini != NULL) + policy->pol_desc->pd_ops->op_policy_fini(policy); +} + +static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + /** + * The policy may be stopped, but the lprocfs files and + * ptlrpc_nrs_policy instances remain present until unregistration time. + * Do not perform the ctl operation if the policy is stopped, as + * policy->pol_private will be NULL in such a case. + */ + if (policy->pol_state == NRS_POL_STATE_STOPPED) + RETURN(-ENODEV); + + RETURN(policy->pol_desc->pd_ops->op_policy_ctl != NULL ? + policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : + -ENOSYS); +} + +static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + ENTRY; + + if (policy->pol_desc->pd_ops->op_policy_stop != NULL) { + spin_unlock(&nrs->nrs_lock); + + policy->pol_desc->pd_ops->op_policy_stop(policy); + + spin_lock(&nrs->nrs_lock); + } + + LASSERT(list_empty(&policy->pol_list_queued)); + LASSERT(policy->pol_req_queued == 0 && + policy->pol_req_started == 0); + + policy->pol_private = NULL; + + policy->pol_state = NRS_POL_STATE_STOPPED; + + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + EXIT; +} + +static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + ENTRY; + + if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTING) + RETURN(-EAGAIN); + + /* In progress or already stopped */ + if (policy->pol_state != NRS_POL_STATE_STARTED) + RETURN(0); + + policy->pol_state = NRS_POL_STATE_STOPPING; + + /* Immediately make it invisible */ + if (nrs->nrs_policy_primary == policy) { + nrs->nrs_policy_primary = NULL; + + } else { + LASSERT(nrs->nrs_policy_fallback == policy); + nrs->nrs_policy_fallback = NULL; + } + + /* I have the only refcount */ + if (policy->pol_ref == 1) + nrs_policy_stop0(policy); + + RETURN(0); +} + +/** + * Transitions the \a nrs NRS head's primary policy to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no + * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. + * + * \param[in] nrs the NRS head to carry out this operation on + */ +static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; + ENTRY; + + if (tmp == NULL) { + /** + * XXX: This should really be RETURN_EXIT, but the latter does + * not currently print anything out, and possibly should be + * fixed to do so. + */ + EXIT; + return; + } + + nrs->nrs_policy_primary = NULL; + + LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); + tmp->pol_state = NRS_POL_STATE_STOPPING; + + if (tmp->pol_ref == 0) + nrs_policy_stop0(tmp); + EXIT; +} + +/** + * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in + * response to an lprocfs command to start a policy. + * + * If a primary policy different to the current one is specified, this function + * will transition the new policy to the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition + * the old primary policy (if there is one) to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. + * + * If the fallback policy is specified, this is taken to indicate an instruction + * to stop the current primary policy, without substituting it with another + * primary policy, so the primary policy (if any) is transitioned to + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding + * references on the policy to ptlrpc_nrs_pol_stae::NRS_POL_STATE_STOPPED. In + * this case, the fallback policy is only left active in the NRS head. + */ +static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy) +{ + struct ptlrpc_nrs *nrs = policy->pol_nrs; + int rc = 0; + ENTRY; + + /** + * Don't allow multiple starting which is too complex, and has no real + * benefit. + */ + if (nrs->nrs_policy_starting) + RETURN(-EAGAIN); + + LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); + + if (policy->pol_state == NRS_POL_STATE_STOPPING) + RETURN(-EAGAIN); + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This is for cases in which the user sets the policy to the + * fallback policy (currently fifo for all services); i.e. the + * user is resetting the policy to the default; so we stop the + * primary policy, if any. + */ + if (policy == nrs->nrs_policy_fallback) { + nrs_policy_stop_primary(nrs); + RETURN(0); + } + + /** + * If we reach here, we must be setting up the fallback policy + * at service startup time, and only a single policy with the + * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can + * register with NRS core. + */ + LASSERT(nrs->nrs_policy_fallback == NULL); + } else { + /** + * Shouldn't start primary policy if w/o fallback policy. + */ + if (nrs->nrs_policy_fallback == NULL) + RETURN(-EPERM); + + if (policy->pol_state == NRS_POL_STATE_STARTED) + RETURN(0); + } + + /** + * Increase the module usage count for policies registering from other + * modules. + */ + if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && + !try_module_get(policy->pol_desc->pd_owner)) { + atomic_dec(&policy->pol_desc->pd_refs); + CERROR("NRS: cannot get module for policy %s; is it alive?\n", + policy->pol_desc->pd_name); + RETURN(-ENODEV); + } + + /** + * Serialize policy starting across the NRS head + */ + nrs->nrs_policy_starting = 1; + + policy->pol_state = NRS_POL_STATE_STARTING; + + if (policy->pol_desc->pd_ops->op_policy_start) { + spin_unlock(&nrs->nrs_lock); + + rc = policy->pol_desc->pd_ops->op_policy_start(policy); + + spin_lock(&nrs->nrs_lock); + if (rc != 0) { + if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) + module_put(policy->pol_desc->pd_owner); + + policy->pol_state = NRS_POL_STATE_STOPPED; + GOTO(out, rc); + } + } + + policy->pol_state = NRS_POL_STATE_STARTED; + + if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { + /** + * This path is only used at PTLRPC service setup time. + */ + nrs->nrs_policy_fallback = policy; + } else { + /* + * Try to stop the current primary policy if there is one. + */ + nrs_policy_stop_primary(nrs); + + /** + * And set the newly-started policy as the primary one. + */ + nrs->nrs_policy_primary = policy; + } + +out: + nrs->nrs_policy_starting = 0; + + RETURN(rc); +} + +/** + * Increases the policy's usage reference count. + */ +static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) +{ + policy->pol_ref++; +} + +/** + * Decreases the policy's usage reference count, and stops the policy in case it + * was already stopping and have no more outstanding usage references (which + * indicates it has no more queued or started requests, and can be safely + * stopped). + */ +static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_ref > 0); + + policy->pol_ref--; + if (unlikely(policy->pol_ref == 0 && + policy->pol_state == NRS_POL_STATE_STOPPING)) + nrs_policy_stop0(policy); +} + +static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) +{ + spin_lock(&policy->pol_nrs->nrs_lock); + nrs_policy_put_locked(policy); + spin_unlock(&policy->pol_nrs->nrs_lock); +} + +/** + * Find and return a policy by name. + */ +static struct ptlrpc_nrs_policy * nrs_policy_find_locked(struct ptlrpc_nrs *nrs, + char *name) +{ + struct ptlrpc_nrs_policy *tmp; + + list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { + if (strncmp(tmp->pol_desc->pd_name, name, + NRS_POL_NAME_MAX) == 0) { + nrs_policy_get_locked(tmp); + return tmp; + } + } + return NULL; +} + +/** + * Release references for the resource hierarchy moving upwards towards the + * policy instance resource. + */ +static void nrs_resource_put(struct ptlrpc_nrs_resource *res) +{ + struct ptlrpc_nrs_policy *policy = res->res_policy; + + if (policy->pol_desc->pd_ops->op_res_put != NULL) { + struct ptlrpc_nrs_resource *parent; + + for (; res != NULL; res = parent) { + parent = res->res_parent; + policy->pol_desc->pd_ops->op_res_put(policy, res); + } + } +} + +/** + * Obtains references for each resource in the resource hierarchy for request + * \a nrq if it is to be handled by \a policy. + * + * \param[in] policy the policy + * \param[in] nrq the request + * \param[in] moving_req denotes whether this is a call to the function by + * ldlm_lock_reorder_req(), in order to move \a nrq to + * the high-priority NRS head; we should not sleep when + * set. + * + * \retval NULL resource hierarchy references not obtained + * \retval valid-pointer the bottom level of the resource hierarchy + * + * \see ptlrpc_nrs_pol_ops::op_res_get() + */ +static +struct ptlrpc_nrs_resource * nrs_resource_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + bool moving_req) +{ + /** + * Set to NULL to traverse the resource hierarchy from the top. + */ + struct ptlrpc_nrs_resource *res = NULL; + struct ptlrpc_nrs_resource *tmp = NULL; + int rc; + + while (1) { + rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, + &tmp, moving_req); + if (rc < 0) { + if (res != NULL) + nrs_resource_put(res); + return NULL; + } + + LASSERT(tmp != NULL); + tmp->res_parent = res; + tmp->res_policy = policy; + res = tmp; + tmp = NULL; + /** + * Return once we have obtained a reference to the bottom level + * of the resource hierarchy. + */ + if (rc > 0) + return res; + } +} + +/** + * Obtains resources for the resource hierarchies and policy references for + * the fallback and current primary policy (if any), that will later be used + * to handle request \a nrq. + * + * \param[in] nrs the NRS head instance that will be handling request \a nrq. + * \param[in] nrq the request that is being handled. + * \param[out] resp the array where references to the resource hierarchy are + * stored. + * \param[in] moving_req is set when obtaining resources while moving a + * request from a policy on the regular NRS head to a + * policy on the HP NRS head (via + * ldlm_lock_reorder_req()). It signifies that + * allocations to get resources should be atomic; for + * a full explanation, see comment in + * ptlrpc_nrs_pol_ops::op_res_get(). + */ +static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_request *nrq, + struct ptlrpc_nrs_resource **resp, + bool moving_req) +{ + struct ptlrpc_nrs_policy *primary = NULL; + struct ptlrpc_nrs_policy *fallback = NULL; + + memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); + + /** + * Obtain policy references. + */ + spin_lock(&nrs->nrs_lock); + + fallback = nrs->nrs_policy_fallback; + nrs_policy_get_locked(fallback); + + primary = nrs->nrs_policy_primary; + if (primary != NULL) + nrs_policy_get_locked(primary); + + spin_unlock(&nrs->nrs_lock); + + /** + * Obtain resource hierarchy references. + */ + resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); + LASSERT(resp[NRS_RES_FALLBACK] != NULL); + + if (primary != NULL) { + resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, + moving_req); + /** + * A primary policy may exist which may not wish to serve a + * particular request for different reasons; release the + * reference on the policy as it will not be used for this + * request. + */ + if (resp[NRS_RES_PRIMARY] == NULL) + nrs_policy_put(primary); + } +} + +/** + * Releases references to resource hierarchies and policies, because they are no + * longer required; used when request handling has been completed, or the + * request is moving to the high priority NRS head. + * + * \param resp the resource hierarchy that is being released + * + * \see ptlrpcnrs_req_hp_move() + * \see ptlrpc_nrs_req_finalize() + */ +static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) +{ + struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; + struct ptlrpc_nrs *nrs = NULL; + int i; + + for (i = 0; i < NRS_RES_MAX; i++) { + if (resp[i] != NULL) { + pols[i] = resp[i]->res_policy; + nrs_resource_put(resp[i]); + resp[i] = NULL; + } else { + pols[i] = NULL; + } + } + + for (i = 0; i < NRS_RES_MAX; i++) { + if (pols[i] == NULL) + continue; + + if (nrs == NULL) { + nrs = pols[i]->pol_nrs; + spin_lock(&nrs->nrs_lock); + } + nrs_policy_put_locked(pols[i]); + } + + if (nrs != NULL) + spin_unlock(&nrs->nrs_lock); +} + +/** + * Obtains an NRS request from \a policy for handling or examination; the + * request should be removed in the 'handling' case. + * + * Calling into this function implies we already know the policy has a request + * waiting to be handled. + * + * \param[in] policy the policy from which a request + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the NRS request to be handled + */ +static inline +struct ptlrpc_nrs_request * nrs_request_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct ptlrpc_nrs_request *nrq; + + LASSERT(policy->pol_req_queued > 0); + + nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); + + LASSERT(ergo(nrq != NULL, nrs_request_policy(nrq) == policy)); + + return nrq; +} + +/** + * Enqueues request \a nrq for later handling, via one one the policies for + * which resources where earlier obtained via nrs_resource_get_safe(). The + * function attempts to enqueue the request first on the primary policy + * (if any), since this is the preferred choice. + * + * \param nrq the request being enqueued + * + * \see nrs_resource_get_safe() + */ +static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy; + int rc; + int i; + + /** + * Try in descending order, because the primary policy (if any) is + * the preferred choice. + */ + for (i = NRS_RES_MAX - 1; i >= 0; i--) { + if (nrq->nr_res_ptrs[i] == NULL) + continue; + + nrq->nr_res_idx = i; + policy = nrq->nr_res_ptrs[i]->res_policy; + + rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); + if (rc == 0) { + policy->pol_nrs->nrs_req_queued++; + policy->pol_req_queued++; + return; + } + } + /** + * Should never get here, as at least the primary policy's + * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always + * succeed. + */ + LBUG(); +} + +/** + * Called when a request has been handled + * + * \param[in] nrs the request that has been handled; can be used for + * job/resource control. + * + * \see ptlrpc_nrs_req_stop_nolock() + */ +static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); + + if (policy->pol_desc->pd_ops->op_req_stop) + policy->pol_desc->pd_ops->op_req_stop(policy, nrq); + + LASSERT(policy->pol_nrs->nrs_req_started > 0); + LASSERT(policy->pol_req_started > 0); + + policy->pol_nrs->nrs_req_started--; + policy->pol_req_started--; +} + +/** + * Handler for operations that can be carried out on policies. + * + * Handles opcodes that are common to all policy types within NRS core, and + * passes any unknown opcodes to the policy-specific control function. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name. + * \param[in] opc the opcode of the operation being carried out. + * \param[in,out] arg can be used to pass information in and out between when + * carrying an operation; usually data that is private to + * the policy at some level, or generic policy status + * information. + * + * \retval -ve error condition + * \retval 0 operation was carried out successfully + */ +static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, + enum ptlrpc_nrs_ctl opc, void *arg) +{ + struct ptlrpc_nrs_policy *policy; + int rc = 0; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) + GOTO(out, rc = -ENOENT); + + switch (opc) { + /** + * Unknown opcode, pass it down to the policy-specific control + * function for handling. + */ + default: + rc = nrs_policy_ctl_locked(policy, opc, arg); + break; + + /** + * Start \e policy + */ + case PTLRPC_NRS_CTL_START: + rc = nrs_policy_start_locked(policy); + break; + } +out: + if (policy != NULL) + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + RETURN(rc); +} + +/** + * Unregisters a policy by name. + * + * \param[in] nrs the NRS head this policy belongs to. + * \param[in] name the human-readable policy name; should be the same as + * ptlrpc_nrs_pol_desc::pd_name + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) +{ + struct ptlrpc_nrs_policy *policy = NULL; + ENTRY; + + spin_lock(&nrs->nrs_lock); + + policy = nrs_policy_find_locked(nrs, name); + if (policy == NULL) { + spin_unlock(&nrs->nrs_lock); + + CERROR("Can't find NRS policy %s\n", name); + RETURN(-ENOENT); + } + + if (policy->pol_ref > 1) { + CERROR("Policy %s is busy with %d references\n", name, + (int)policy->pol_ref); + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + RETURN(-EBUSY); + } + + LASSERT(policy->pol_req_queued == 0); + LASSERT(policy->pol_req_started == 0); + + if (policy->pol_state != NRS_POL_STATE_STOPPED) { + nrs_policy_stop_locked(policy); + LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); + } + + list_del(&policy->pol_list); + nrs->nrs_num_pols--; + + nrs_policy_put_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + nrs_policy_fini(policy); + + LASSERT(policy->pol_private == NULL); + OBD_FREE_PTR(policy); + + RETURN(0); +} + +/** + * Register a policy from \policy descriptor \a desc with NRS head \a nrs. + * + * \param[in] nrs the NRS head on which the policy will be registered. + * \param[in] desc the policy descriptor from which the information will be + * obtained to register the policy. + * + * \retval -ve error + * \retval 0 success + */ +static int nrs_policy_register(struct ptlrpc_nrs *nrs, + struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + LASSERT(desc->pd_ops != NULL); + LASSERT(desc->pd_ops->op_res_get != NULL); + LASSERT(desc->pd_ops->op_req_get != NULL); + LASSERT(desc->pd_ops->op_req_enqueue != NULL); + LASSERT(desc->pd_ops->op_req_dequeue != NULL); + LASSERT(desc->pd_compat != NULL); + + OBD_CPT_ALLOC_GFP(policy, svcpt->scp_service->srv_cptable, + svcpt->scp_cpt, sizeof(*policy), __GFP_IO); + if (policy == NULL) + RETURN(-ENOMEM); + + policy->pol_nrs = nrs; + policy->pol_desc = desc; + policy->pol_state = NRS_POL_STATE_STOPPED; + policy->pol_flags = desc->pd_flags; + + INIT_LIST_HEAD(&policy->pol_list); + INIT_LIST_HEAD(&policy->pol_list_queued); + + rc = nrs_policy_init(policy); + if (rc != 0) { + OBD_FREE_PTR(policy); + RETURN(rc); + } + + spin_lock(&nrs->nrs_lock); + + tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); + if (tmp != NULL) { + CERROR("NRS policy %s has been registered, can't register it " + "for %s\n", policy->pol_desc->pd_name, + svcpt->scp_service->srv_name); + nrs_policy_put_locked(tmp); + + spin_unlock(&nrs->nrs_lock); + nrs_policy_fini(policy); + OBD_FREE_PTR(policy); + + RETURN(-EEXIST); + } + + list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); + nrs->nrs_num_pols++; + + if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) + rc = nrs_policy_start_locked(policy); + + spin_unlock(&nrs->nrs_lock); + + if (rc != 0) + (void) nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + + RETURN(rc); +} + +/** + * Enqueue request \a req using one of the policies its resources are referring + * to. + * + * \param[in] req the request to enqueue. + */ +static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy; + + LASSERT(req->rq_nrq.nr_initialized); + LASSERT(!req->rq_nrq.nr_enqueued); + + nrs_request_enqueue(&req->rq_nrq); + req->rq_nrq.nr_enqueued = 1; + + policy = nrs_request_policy(&req->rq_nrq); + /** + * Add the policy to the NRS head's list of policies with enqueued + * requests, if it has not been added there. + */ + if (unlikely(list_empty(&policy->pol_list_queued))) + list_add_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); +} + +/** + * Enqueue a request on the high priority NRS head. + * + * \param req the request to enqueue. + */ +static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + ENTRY; + + spin_lock(&req->rq_lock); + req->rq_hp = 1; + ptlrpc_nrs_req_add_nolock(req); + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + spin_unlock(&req->rq_lock); + EXIT; +} + +/** + * Returns a boolean predicate indicating whether the policy described by + * \a desc is adequate for use with service \a svc. + * + * \param[in] svc the service + * \param[in] desc the policy descriptor + * + * \retval false the policy is not compatible with the service + * \retval true the policy is compatible with the service + */ +static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return desc->pd_compat(svc, desc); +} + +/** + * Registers all compatible policies in nrs_core.nrs_policies, for NRS head + * \a nrs. + * + * \param[in] nrs the NRS head + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * + * \see ptlrpc_service_nrs_setup() + */ +static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) +{ + struct ptlrpc_nrs_pol_desc *desc; + /* for convenience */ + struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int rc = -EINVAL; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (nrs_policy_compatible(svc, desc)) { + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svc->srv_name, rc); + /** + * Fail registration if any of the policies' + * registration fails. + */ + break; + } + } + } + + RETURN(rc); +} + +/** + * Initializes NRS head \a nrs of service partition \a svcpt, and registers all + * compatible policies in NRS core, with the NRS head. + * + * \param[in] nrs the NRS head + * \param[in] svcpt the PTLRPC service partition to setup + * + * \retval -ve error + * \retval 0 success + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, + struct ptlrpc_service_part *svcpt) +{ + int rc; + enum ptlrpc_nrs_queue_type queue; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + if (nrs == &svcpt->scp_nrs_reg) + queue = PTLRPC_NRS_QUEUE_REG; + else if (nrs == svcpt->scp_nrs_hp) + queue = PTLRPC_NRS_QUEUE_HP; + else + LBUG(); + + nrs->nrs_svcpt = svcpt; + nrs->nrs_queue_type = queue; + spin_lock_init(&nrs->nrs_lock); + INIT_LIST_HEAD(&nrs->nrs_policy_list); + INIT_LIST_HEAD(&nrs->nrs_policy_queued); + + rc = nrs_register_policies_locked(nrs); + + RETURN(rc); +} + +/** + * Allocates a regular and optionally a high-priority NRS head (if the service + * handles high-priority RPCs), and then registers all available compatible + * policies on those NRS heads. + * + * \param[in,out] svcpt the PTLRPC service partition to setup + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + int rc; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + + /** + * Initialize the regular NRS head. + */ + nrs = nrs_svcpt2nrs(svcpt, false); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + if (rc < 0) + GOTO(out, rc); + + /** + * Optionally allocate a high-priority NRS head. + */ + if (svcpt->scp_service->srv_ops.so_hpreq_handler == NULL) + GOTO(out, rc); + + OBD_CPT_ALLOC_PTR(svcpt->scp_nrs_hp, + svcpt->scp_service->srv_cptable, + svcpt->scp_cpt); + if (svcpt->scp_nrs_hp == NULL) + GOTO(out, rc = -ENOMEM); + + nrs = nrs_svcpt2nrs(svcpt, true); + rc = nrs_svcpt_setup_locked0(nrs, svcpt); + +out: + RETURN(rc); +} + +/** + * Unregisters all policies on all available NRS heads in a service partition; + * called at PTLRPC service unregistration time. + * + * \param[in] svcpt the PTLRPC service partition + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + */ +static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_policy *tmp; + int rc; + bool hp = false; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + nrs->nrs_stopping = 1; + + list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, + pol_list) { + rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); + LASSERT(rc == 0); + } + + /** + * If the service partition has an HP NRS head, clean that up as well. + */ + if (!hp && nrs_svcpt_has_hp(svcpt)) { + hp = true; + goto again; + } + + if (hp) + OBD_FREE_PTR(nrs); + + EXIT; +} + +/** + * Returns the descriptor for a policy as identified by by \a name. + * + * \param[in] name the policy name + * + * \retval the policy descriptor + * \retval NULL + */ +static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) +{ + struct ptlrpc_nrs_pol_desc *tmp; + ENTRY; + + list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { + if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) + RETURN(tmp); + } + RETURN(NULL); +} + +/** + * Removes the policy from all supported NRS heads of all partitions of all + * PTLRPC services. + * + * \param[in] desc the policy descriptor to unregister + * + * \retval -ve error + * \retval 0 successfully unregistered policy on all supported NRS heads + * + * \pre mutex_is_locked(&nrs_core.nrs_mutex) + * \pre mutex_is_locked(&ptlrpc_all_services_mutex) + */ +static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) +{ + struct ptlrpc_nrs *nrs; + struct ptlrpc_service *svc; + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); + LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + bool hp = false; + +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_unregister(nrs, desc->pd_name); + /** + * Ignore -ENOENT as the policy may not have registered + * successfully on all service partitions. + */ + if (rc == -ENOENT) { + rc = 0; + } else if (rc != 0) { + CERROR("Failed to unregister NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + RETURN(rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + RETURN(rc); +} + +/** + * Registers a new policy with NRS core. + * + * The function will only succeed if policy registration with all compatible + * service partitions (if any) is successful. + * + * N.B. This function should be called either at ptlrpc module initialization + * time when registering a policy that ships with NRS core, or in a + * module's init() function for policies registering from other modules. + * + * \param[in] conf configuration information for the new policy to register + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_service *svc; + struct ptlrpc_nrs_pol_desc *desc; + int rc = 0; + ENTRY; + + LASSERT(conf != NULL); + LASSERT(conf->nc_ops != NULL); + LASSERT(conf->nc_compat != NULL); + LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, + conf->nc_compat_svc_name != NULL)); + LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, + conf->nc_owner != NULL)); + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + /** + * External policies are not allowed to start immediately upon + * registration, as there is a relatively higher chance that their + * registration might fail. In such a case, some policy instances may + * already have requests queued wen unregistration needs to happen as + * part o cleanup; since there is currently no way to drain requests + * from a policy unless the service is unregistering, we just disallow + * this. + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && + (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START))) { + CERROR("NRS: failing to register policy %s. Please check " + "policy flags; external policies cannot act as fallback " + "policies, or be started immediately upon registration " + "without interaction with lprocfs\n", conf->nc_name); + RETURN(-EINVAL); + } + + mutex_lock(&nrs_core.nrs_mutex); + + if (nrs_policy_find_desc_locked(conf->nc_name) != NULL) { + CERROR("NRS: failing to register policy %s which has already " + "been registered with NRS core!\n", + conf->nc_name); + GOTO(fail, rc = -EEXIST); + } + + OBD_ALLOC_PTR(desc); + if (desc == NULL) + GOTO(fail, rc = -ENOMEM); + + strncpy(desc->pd_name, conf->nc_name, NRS_POL_NAME_MAX); + desc->pd_ops = conf->nc_ops; + desc->pd_compat = conf->nc_compat; + desc->pd_compat_svc_name = conf->nc_compat_svc_name; + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) + desc->pd_owner = conf->nc_owner; + desc->pd_flags = conf->nc_flags; + atomic_set(&desc->pd_refs, 0); + + /** + * For policies that are held in the same module as NRS (currently + * ptlrpc), do not register the policy with all compatible services, + * as the services will not have started at this point, since we are + * calling from ptlrpc module initialization code. In such cases each + * service will register all compatible policies later, via + * ptlrpc_service_nrs_setup(). + */ + if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) + goto internal; + + /** + * Register the new policy on all compatible services + */ + mutex_lock(&ptlrpc_all_services_mutex); + + list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { + struct ptlrpc_service_part *svcpt; + int i; + int rc2; + + if (!nrs_policy_compatible(svc, desc) || + unlikely(svc->srv_is_stopping)) + continue; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + struct ptlrpc_nrs *nrs; + bool hp = false; +again: + nrs = nrs_svcpt2nrs(svcpt, hp); + rc = nrs_policy_register(nrs, desc); + if (rc != 0) { + CERROR("Failed to register NRS policy %s for " + "partition %d of service %s: %d\n", + desc->pd_name, svcpt->scp_cpt, + svcpt->scp_service->srv_name, rc); + + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + + if (!hp && nrs_svc_has_hp(svc)) { + hp = true; + goto again; + } + } + + /** + * No need to take a reference to other modules here, as we + * will be calling from the module's init() function. + */ + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) { + rc2 = nrs_policy_unregister_locked(desc); + /** + * Should not fail at this point + */ + LASSERT(rc2 == 0); + mutex_unlock(&ptlrpc_all_services_mutex); + OBD_FREE_PTR(desc); + GOTO(fail, rc); + } + } + } + + mutex_unlock(&ptlrpc_all_services_mutex); +internal: + list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); +fail: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_register); + +/** + * Unregisters a previously registered policy with NRS core. All instances of + * the policy on all NRS heads of all supported services are removed. + * + * N.B. This function should only be called from a module's exit() function. + * Although it can be used for policies that ship alongside NRS core, the + * function is primarily intended for policies that register externally, + * from other modules. + * + * \param[in] conf configuration information for the policy to unregister + * + * \retval -ve error + * \retval 0 success + */ +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf) +{ + struct ptlrpc_nrs_pol_desc *desc; + int rc; + ENTRY; + + LASSERT(conf != NULL); + + if (conf->nc_flags & PTLRPC_NRS_FL_FALLBACK) { + CERROR("Unable to unregister a fallback policy, unless the " + "PTLRPC service is stopping.\n"); + RETURN(-EPERM); + } + + conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; + + mutex_lock(&nrs_core.nrs_mutex); + + desc = nrs_policy_find_desc_locked(conf->nc_name); + if (desc == NULL) { + CERROR("Failing to unregister NRS policy %s which has " + "not been registered with NRS core!\n", + conf->nc_name); + GOTO(not_exist, rc = -ENOENT); + } + + mutex_lock(&ptlrpc_all_services_mutex); + + rc = nrs_policy_unregister_locked(desc); + if (rc < 0) { + if (rc == -EBUSY) + CERROR("Please first stop policy %s on all service " + "partitions and then retry to unregister the " + "policy.\n", conf->nc_name); + GOTO(fail, rc); + } + + CDEBUG(D_INFO, "Unregistering policy %s from NRS core.\n", + conf->nc_name); + + list_del(&desc->pd_list); + OBD_FREE_PTR(desc); + +fail: + mutex_unlock(&ptlrpc_all_services_mutex); + +not_exist: + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_nrs_policy_unregister); + +/** + * Setup NRS heads on all service partitions of service \a svc, and register + * all compatible policies on those NRS heads. + * + * To be called from withing ptl + * \param[in] svc the service to setup + * + * \retval -ve error, the calling logic should eventually call + * ptlrpc_service_nrs_cleanup() to undo any work performed + * by this function. + * + * \see ptlrpc_register_service() + * \see ptlrpc_service_nrs_cleanup() + */ +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + int rc = 0; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Initialize NRS heads on all service CPTs. + */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + rc = nrs_svcpt_setup_locked(svcpt); + if (rc != 0) + GOTO(failed, rc); + } + + /** + * Set up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_init != NULL) { + rc = desc->pd_ops->op_lprocfs_init(svc); + if (rc != 0) + GOTO(failed, rc); + } + } + +failed: + + mutex_unlock(&nrs_core.nrs_mutex); + + RETURN(rc); +} + +/** + * Unregisters all policies on all service partitions of service \a svc. + * + * \param[in] svc the PTLRPC service to unregister + */ +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + const struct ptlrpc_nrs_pol_desc *desc; + int i; + + mutex_lock(&nrs_core.nrs_mutex); + + /** + * Clean up NRS heads on all service partitions + */ + ptlrpc_service_for_each_part(svcpt, i, svc) + nrs_svcpt_cleanup_locked(svcpt); + + /** + * Clean up lprocfs interfaces for all supported policies for the + * service. + */ + list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { + if (!nrs_policy_compatible(svc, desc)) + continue; + + if (desc->pd_ops->op_lprocfs_fini != NULL) + desc->pd_ops->op_lprocfs_fini(svc); + } + + mutex_unlock(&nrs_core.nrs_mutex); +} + +/** + * Obtains NRS head resources for request \a req. + * + * These could be either on the regular or HP NRS head of \a svcpt; resources + * taken on the regular head can later be swapped for HP head resources by + * ldlm_lock_reorder_req(). + * + * \param[in] svcpt the service partition + * \param[in] req the request + * \param[in] hp which NRS head of \a svcpt to use + */ +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); + nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, + false); + + /** + * It is fine to access \e nr_initialized without locking as there is + * no contention at this early stage. + */ + req->rq_nrq.nr_initialized = 1; +} + +/** + * Releases resources for a request; is called after the request has been + * handled. + * + * \param[in] req the request + * + * \see ptlrpc_server_finish_request() + */ +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_initialized) { + nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); + /* no protection on bit nr_initialized because no + * contention at this late stage */ + req->rq_nrq.nr_finalized = 1; + } +} + +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) +{ + if (req->rq_nrq.nr_started) + nrs_request_stop(&req->rq_nrq); +} + +/** + * Enqueues request \a req on either the regular or high-priority NRS head + * of service partition \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] req the request to be enqueued + * \param[in] hp whether to enqueue the request on the regular or + * high-priority NRS head. + */ +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp) +{ + spin_lock(&svcpt->scp_req_lock); + + if (hp) + ptlrpc_nrs_hpreq_add_nolock(req); + else + ptlrpc_nrs_req_add_nolock(req); + + spin_unlock(&svcpt->scp_req_lock); +} + +static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) +{ + LASSERT(policy->pol_nrs->nrs_req_queued > 0); + LASSERT(policy->pol_req_queued > 0); + + policy->pol_nrs->nrs_req_queued--; + policy->pol_req_queued--; + + /** + * If the policy has no more requests queued, remove it from + * ptlrpc_nrs::nrs_policy_queued. + */ + if (unlikely(policy->pol_req_queued == 0)) { + list_del_init(&policy->pol_list_queued); + + /** + * If there are other policies with queued requests, move the + * current policy to the end so that we can round robin over + * all policies and drain the requests. + */ + } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { + LASSERT(policy->pol_req_queued < + policy->pol_nrs->nrs_req_queued); + + list_move_tail(&policy->pol_list_queued, + &policy->pol_nrs->nrs_policy_queued); + } +} + +/** + * Obtains a request for handling from an NRS head of service partition + * \a svcpt. + * + * \param[in] svcpt the service partition + * \param[in] hp whether to obtain a request from the regular or + * high-priority NRS head. + * \param[in] peek when set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force when set, it will force a policy to return a request if it + * has one pending + * + * \retval the request to be handled + * \retval NULL the head has no requests to serve + */ +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + struct ptlrpc_nrs_policy *policy; + struct ptlrpc_nrs_request *nrq; + + /** + * Always try to drain requests from all NRS polices even if they are + * inactive, because the user can change policy status at runtime. + */ + list_for_each_entry(policy, &nrs->nrs_policy_queued, + pol_list_queued) { + nrq = nrs_request_get(policy, peek, force); + if (nrq != NULL) { + if (likely(!peek)) { + nrq->nr_started = 1; + + policy->pol_req_started++; + policy->pol_nrs->nrs_req_started++; + + nrs_request_removed(policy); + } + + return container_of(nrq, struct ptlrpc_request, rq_nrq); + } + } + + return NULL; +} + +/** + * Dequeues request \a req from the policy it has been enqueued on. + * + * \param[in] req the request + */ +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_policy *policy = nrs_request_policy(&req->rq_nrq); + + policy->pol_desc->pd_ops->op_req_dequeue(policy, &req->rq_nrq); + + req->rq_nrq.nr_enqueued = 0; + + nrs_request_removed(policy); +} + +/** + * Returns whether there are any requests currently enqueued on any of the + * policies of service partition's \a svcpt NRS head specified by \a hp. Should + * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable + * result. + * + * \param[in] svcpt the service partition to enquire. + * \param[in] hp whether the regular or high-priority NRS head is to be + * enquired. + * + * \retval false the indicated NRS head has no enqueued requests. + * \retval true the indicated NRS head has some enqueued requests. + */ +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); + + return nrs->nrs_req_queued > 0; +}; + +/** + * Moves request \a req from the regular to the high-priority NRS head. + * + * \param[in] req the request to move + */ +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + struct ptlrpc_nrs_resource *res1[NRS_RES_MAX]; + struct ptlrpc_nrs_resource *res2[NRS_RES_MAX]; + ENTRY; + + /** + * Obtain the high-priority NRS head resources. + */ + nrs_resource_get_safe(nrs_svcpt2nrs(svcpt, true), nrq, res1, true); + + spin_lock(&svcpt->scp_req_lock); + + if (!ptlrpc_nrs_req_can_move(req)) + goto out; + + ptlrpc_nrs_req_del_nolock(req); + + memcpy(res2, nrq->nr_res_ptrs, NRS_RES_MAX * sizeof(res2[0])); + memcpy(nrq->nr_res_ptrs, res1, NRS_RES_MAX * sizeof(res1[0])); + + ptlrpc_nrs_hpreq_add_nolock(req); + + memcpy(res1, res2, NRS_RES_MAX * sizeof(res1[0])); +out: + spin_unlock(&svcpt->scp_req_lock); + + /** + * Release either the regular NRS head resources if we moved the + * request, or the high-priority NRS head resources if we took a + * reference earlier in this function and ptlrpc_nrs_req_can_move() + * returned false. + */ + nrs_resource_put_safe(res1); + EXIT; +} + +/** + * Carries out a control operation \a opc on the policy identified by the + * human-readable \a name, on either all partitions, or only on the first + * partition of service \a svc. + * + * \param[in] svc the service the policy belongs to. + * \param[in] queue whether to carry out the command on the policy which + * belongs to the regular, high-priority, or both NRS + * heads of service partitions of \a svc. + * \param[in] name the policy to act upon, by human-readable name + * \param[in] opc the opcode of the operation to carry out + * \param[in] single when set, the operation will only be carried out on the + * NRS heads of the first service partition of \a svc. + * This is useful for some policies which e.g. share + * identical values on the same parameters of different + * service partitions; when reading these parameters via + * lprocfs, these policies may just want to obtain and + * print out the values from the first service partition. + * Storing these values centrally elsewhere then could be + * another solution for this. + * \param[in,out] arg can be used as a generic in/out buffer between control + * operations and the user environment. + * + *\retval -ve error condition + *\retval 0 operation was carried out successfully + */ +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg) +{ + struct ptlrpc_service_part *svcpt; + int i; + int rc = 0; + ENTRY; + + LASSERT(opc != PTLRPC_NRS_CTL_INVALID); + + if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) + return -EINVAL; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, + opc, arg); + if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && + single)) + GOTO(out, rc); + } + + if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { + /** + * XXX: We could optionally check for + * nrs_svc_has_hp(svc) here, and return an error if it + * is false. Right now we rely on the policies' lprocfs + * handlers that call the present function to make this + * check; if they fail to do so, they might hit the + * assertion inside nrs_svcpt2nrs() below. + */ + rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, + opc, arg); + if (rc != 0 || single) + GOTO(out, rc); + } + } +out: + RETURN(rc); +} + + +/* ptlrpc/nrs_fifo.c */ +extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; + +/** + * Adds all policies that ship with the ptlrpc module, to NRS core's list of + * policies \e nrs_core.nrs_policies. + * + * \retval 0 all policies have been registered successfully + * \retval -ve error + */ +int ptlrpc_nrs_init(void) +{ + int rc; + ENTRY; + + mutex_init(&nrs_core.nrs_mutex); + INIT_LIST_HEAD(&nrs_core.nrs_policies); + + rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); + if (rc != 0) + GOTO(fail, rc); + + + RETURN(rc); +fail: + /** + * Since no PTLRPC services have been started at this point, all we need + * to do for cleanup is to free the descriptors. + */ + ptlrpc_nrs_fini(); + + RETURN(rc); +} + +/** + * Removes all policy desciptors from nrs_core::nrs_policies, and frees the + * policy descriptors. + * + * Since all PTLRPC services are stopped at this point, there are no more + * instances of any policies, because each service will have stopped its policy + * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the + * descriptors here. + */ +void ptlrpc_nrs_fini(void) +{ + struct ptlrpc_nrs_pol_desc *desc; + struct ptlrpc_nrs_pol_desc *tmp; + + list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, + pd_list) { + list_del_init(&desc->pd_list); + OBD_FREE_PTR(desc); + } +} + +/** @} nrs */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c new file mode 100644 index 000000000000..ddfb5102d822 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_crr.c @@ -0,0 +1,40 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_crr.c + * + * Network Request Scheduler (NRS) CRR-N policy + * + * Request ordering in a batched Round-Robin manner over client NIDs + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c new file mode 100644 index 000000000000..7d3ee9706c9b --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c @@ -0,0 +1,270 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_fifo.c + * + * Network Request Scheduler (NRS) FIFO policy + * + * Handles RPCs in a FIFO manner, as received from the network. This policy is + * a logical wrapper around previous, non-NRS functionality. It is used as the + * default and fallback policy for all types of RPCs on all PTLRPC service + * partitions, for both regular and high-priority NRS heads. Default here means + * the policy is the one enabled at PTLRPC service partition startup time, and + * fallback means the policy is used to handle RPCs that are not handled + * successfully or are not handled at all by any primary policy that may be + * enabled on a given NRS head. + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ +/** + * \addtogoup nrs + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include "ptlrpc_internal.h" + +/** + * \name fifo + * + * The FIFO policy is a logical wrapper around previous, non-NRS functionality. + * It schedules RPCs in the same order as they are queued from LNet. + * + * @{ + */ + +#define NRS_POL_NAME_FIFO "fifo" + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a + * policy-specific private data structure. + * + * \param[in] policy The policy to start + * + * \retval -ENOMEM OOM error + * \retval 0 success + * + * \see nrs_policy_register() + * \see nrs_policy_ctl() + */ +static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head; + + OBD_CPT_ALLOC_PTR(head, nrs_pol2cptab(policy), nrs_pol2cptid(policy)); + if (head == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&head->fh_list); + policy->pol_private = head; + return 0; +} + +/** + * Is called before the policy transitions into + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific + * private data structure. + * + * \param[in] policy The policy to stop + * + * \see nrs_policy_stop0() + */ +static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) +{ + struct nrs_fifo_head *head = policy->pol_private; + + LASSERT(head != NULL); + LASSERT(list_empty(&head->fh_list)); + + OBD_FREE_PTR(head); +} + +/** + * Is called for obtaining a FIFO policy resource. + * + * \param[in] policy The policy on which the request is being asked for + * \param[in] nrq The request for which resources are being taken + * \param[in] parent Parent resource, unused in this policy + * \param[out] resp Resources references are placed in this array + * \param[in] moving_req Signifies limited caller context; unused in this + * policy + * + * \retval 1 The FIFO policy only has a one-level resource hierarchy, as since + * it implements a simple scheduling algorithm in which request + * priority is determined on the request arrival order, it does not + * need to maintain a set of resources that would otherwise be used + * to calculate a request's priority. + * + * \see nrs_resource_get_safe() + */ +static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, bool moving_req) +{ + /** + * Just return the resource embedded inside nrs_fifo_head, and end this + * resource hierarchy reference request. + */ + *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; + return 1; +} + +/** + * Called when getting a request from the FIFO policy for handling, or just + * peeking; removes the request from the policy when it is to be handled. + * + * \param[in] policy The policy + * \param[in] peek When set, signifies that we just want to examine the + * request, and not handle it, so the request is not removed + * from the policy. + * \param[in] force Force the policy to return a request; unused in this + * policy + * + * \retval The request to be handled; this is the next request in the FIFO + * queue + * + * \see ptlrpc_nrs_req_get_nolock() + * \see nrs_request_get() + */ +static +struct ptlrpc_nrs_request * nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, + bool peek, bool force) +{ + struct nrs_fifo_head *head = policy->pol_private; + struct ptlrpc_nrs_request *nrq; + + nrq = unlikely(list_empty(&head->fh_list)) ? NULL : + list_entry(head->fh_list.next, struct ptlrpc_nrs_request, + nr_u.fifo.fr_list); + + if (likely(!peek && nrq != NULL)) { + struct ptlrpc_request *req = container_of(nrq, + struct ptlrpc_request, + rq_nrq); + + list_del_init(&nrq->nr_u.fifo.fr_list); + + CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: "LPU64 + "\n", policy->pol_desc->pd_name, + libcfs_id2str(req->rq_peer), nrq->nr_u.fifo.fr_sequence); + } + + return nrq; +} + +/** + * Adds request \a nrq to \a policy's list of queued requests + * + * \param[in] policy The policy + * \param[in] nrq The request to add + * + * \retval 0 success; nrs_request_enqueue() assumes this function will always + * succeed + */ +static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct nrs_fifo_head *head; + + head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, + fh_res); + /** + * Only used for debugging + */ + nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; + list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); + + return 0; +} + +/** + * Removes request \a nrq from \a policy's list of queued requests. + * + * \param[in] policy The policy + * \param[in] nrq The request to remove + */ +static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); + list_del_init(&nrq->nr_u.fifo.fr_list); +} + +/** + * Prints a debug statement right before the request \a nrq stops being + * handled. + * + * \param[in] policy The policy handling the request + * \param[in] nrq The request being handled + * + * \see ptlrpc_server_finish_request() + * \see ptlrpc_nrs_req_stop_nolock() + */ +static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq) +{ + struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, + rq_nrq); + + CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: "LPU64"\n", + policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), + nrq->nr_u.fifo.fr_sequence); +} + +/** + * FIFO policy operations + */ +static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { + .op_policy_start = nrs_fifo_start, + .op_policy_stop = nrs_fifo_stop, + .op_res_get = nrs_fifo_res_get, + .op_req_get = nrs_fifo_req_get, + .op_req_enqueue = nrs_fifo_req_add, + .op_req_dequeue = nrs_fifo_req_del, + .op_req_stop = nrs_fifo_req_stop, +}; + +/** + * FIFO policy configuration + */ +struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { + .nc_name = NRS_POL_NAME_FIFO, + .nc_ops = &nrs_fifo_ops, + .nc_compat = nrs_policy_compat_all, + .nc_flags = PTLRPC_NRS_FL_FALLBACK | + PTLRPC_NRS_FL_REG_START +}; + +/** @} fifo */ + +/** @} nrs */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c new file mode 100644 index 000000000000..a88c51993df6 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/nrs_orr.c @@ -0,0 +1,37 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011 Intel Corporation + * + * Copyright 2012 Xyratex Technology Limited + */ +/* + * lustre/ptlrpc/nrs_orr.c + * + * Network Request Scheduler (NRS) ORR and TRR policies + * + * Request scheduling in a Round-Robin manner over backend-fs objects and OSTs + * respectively + * + * Author: Liang Zhen + * Author: Nikitas Angelinas + */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c new file mode 100644 index 000000000000..1437636dfe28 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c @@ -0,0 +1,2575 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pack_generic.c + * + * (Un)packing of OST requests + * + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include + +#include +#include +#include +#include +#include + +static inline int lustre_msg_hdr_size_v2(int count) +{ + return cfs_size_round(offsetof(struct lustre_msg_v2, + lm_buflens[count])); +} + +int lustre_msg_hdr_size(__u32 magic, int count) +{ + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_hdr_size_v2(count); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_hdr_size); + +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + lustre_set_req_swabbed(req, index); + else + lustre_set_rep_swabbed(req, index); +} +EXPORT_SYMBOL(ptlrpc_buf_set_swabbed); + +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index) +{ + if (inout) + return (ptlrpc_req_need_swab(req) && + !lustre_req_swabbed(req, index)); + else + return (ptlrpc_rep_need_swab(req) && + !lustre_rep_swabbed(req, index)); +} +EXPORT_SYMBOL(ptlrpc_buf_need_swab); + +static inline int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, + __u32 version) +{ + __u32 ver = lustre_msg_get_version(msg); + return (ver & LUSTRE_VERSION_MASK) != version; +} + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + CERROR("msg v1 not supported - please upgrade you system\n"); + return -EINVAL; + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_check_version_v2(msg, version); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_check_version); + +/* early reply size */ +int lustre_msg_early_size() +{ + static int size = 0; + if (!size) { + /* Always reply old ptlrpc_body_v2 to keep interoprability + * with the old client (< 2.3) which doesn't have pb_jobid + * in the ptlrpc_body. + * + * XXX Remove this whenever we dorp interoprability with such + * client. + */ + __u32 pblen = sizeof(struct ptlrpc_body_v2); + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); + } + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + +int lustre_msg_size_v2(int count, __u32 *lengths) +{ + int size; + int i; + + size = lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) + size += cfs_size_round(lengths[i]); + + return size; +} +EXPORT_SYMBOL(lustre_msg_size_v2); + +/* This returns the size of the buffer that is required to hold a lustre_msg + * with the given sub-buffer lengths. + * NOTE: this should only be used for NEW requests, and should always be + * in the form of a v2 request. If this is a connection to a v1 + * target then the first buffer will be stripped because the ptlrpc + * data is part of the lustre_msg_v1 header. b=14043 */ +int lustre_msg_size(__u32 magic, int count, __u32 *lens) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(count, lens); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_size); + +/* This is used to determine the size of a buffer that was already packed + * and will correctly handle the different message formats. */ +int lustre_packed_msg_size(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_packed_msg_size); + +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs) +{ + char *ptr; + int i; + + msg->lm_bufcount = count; + /* XXX: lm_secflvr uninitialized here */ + msg->lm_magic = LUSTRE_MSG_MAGIC_V2; + + for (i = 0; i < count; i++) + msg->lm_buflens[i] = lens[i]; + + if (bufs == NULL) + return; + + ptr = (char *)msg + lustre_msg_hdr_size_v2(count); + for (i = 0; i < count; i++) { + char *tmp = bufs[i]; + LOGL(tmp, lens[i], ptr); + } +} +EXPORT_SYMBOL(lustre_init_msg_v2); + +static int lustre_pack_request_v2(struct ptlrpc_request *req, + int count, __u32 *lens, char **bufs) +{ + int reqlen, rc; + + reqlen = lustre_msg_size_v2(count, lens); + + rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); + if (rc) + return rc; + + req->rq_reqlen = reqlen; + + lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); + lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); + return 0; +} + +int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, + __u32 *lens, char **bufs) +{ + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + /* only use new format, we don't need to be compatible with 1.4 */ + magic = LUSTRE_MSG_MAGIC_V2; + + switch (magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_pack_request_v2(req, count, lens, bufs); + default: + LASSERTF(0, "incorrect message magic: %08x\n", magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_pack_request); + +#if RS_DEBUG +LIST_HEAD(ptlrpc_rs_debug_lru); +spinlock_t ptlrpc_rs_debug_lock; + +#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) + +#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ +do { \ + spin_lock(&ptlrpc_rs_debug_lock); \ + list_del(&(rs)->rs_debug_list); \ + spin_unlock(&ptlrpc_rs_debug_lock); \ +} while (0) +#else +# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while(0) +# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while(0) +#endif + +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_reply_state *rs = NULL; + + spin_lock(&svcpt->scp_rep_lock); + + /* See if we have anything in a pool, and wait if nothing */ + while (list_empty(&svcpt->scp_rep_idle)) { + struct l_wait_info lwi; + int rc; + + spin_unlock(&svcpt->scp_rep_lock); + /* If we cannot get anything for some long time, we better + * bail out instead of waiting infinitely */ + lwi = LWI_TIMEOUT(cfs_time_seconds(10), NULL, NULL); + rc = l_wait_event(svcpt->scp_rep_waitq, + !list_empty(&svcpt->scp_rep_idle), &lwi); + if (rc != 0) + goto out; + spin_lock(&svcpt->scp_rep_lock); + } + + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, rs_list); + list_del(&rs->rs_list); + + spin_unlock(&svcpt->scp_rep_lock); + + memset(rs, 0, svcpt->scp_service->srv_max_reply_size); + rs->rs_svcpt = svcpt; + rs->rs_prealloc = 1; +out: + return rs; +} + +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + spin_unlock(&svcpt->scp_rep_lock); + wake_up(&svcpt->scp_rep_waitq); +} + +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags) +{ + struct ptlrpc_reply_state *rs; + int msg_len, rc; + ENTRY; + + LASSERT(req->rq_reply_state == NULL); + + if ((flags & LPRFL_EARLY_REPLY) == 0) { + spin_lock(&req->rq_lock); + req->rq_packed_final = 1; + spin_unlock(&req->rq_lock); + } + + msg_len = lustre_msg_size_v2(count, lens); + rc = sptlrpc_svc_alloc_rs(req, msg_len); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ + rs->rs_cb_id.cbid_fn = reply_out_callback; + rs->rs_cb_id.cbid_arg = rs; + rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; + INIT_LIST_HEAD(&rs->rs_exp_list); + INIT_LIST_HEAD(&rs->rs_obd_list); + INIT_LIST_HEAD(&rs->rs_list); + spin_lock_init(&rs->rs_lock); + + req->rq_replen = msg_len; + req->rq_reply_state = rs; + req->rq_repmsg = rs->rs_msg; + + lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); + lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); + + PTLRPC_RS_DEBUG_LRU_ADD(rs); + + RETURN(0); +} +EXPORT_SYMBOL(lustre_pack_reply_v2); + +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs, int flags) +{ + int rc = 0; + __u32 size[] = { sizeof(struct ptlrpc_body) }; + + if (!lens) { + LASSERT(count == 1); + lens = size; + } + + LASSERT(count > 0); + LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + rc = -EINVAL; + } + if (rc != 0) + CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, + lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); + return rc; +} +EXPORT_SYMBOL(lustre_pack_reply_flags); + +int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} +EXPORT_SYMBOL(lustre_pack_reply); + +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) +{ + int i, offset, buflen, bufcount; + + LASSERT(m != NULL); + LASSERT(n >= 0); + + bufcount = m->lm_bufcount; + if (unlikely(n >= bufcount)) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); + return NULL; + } + + buflen = m->lm_buflens[n]; + if (unlikely(buflen < min_size)) { + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", m, n, buflen, min_size, + n == MSG_PTLRPC_BODY_OFF ? -1 : lustre_msg_get_opc(m)); + return NULL; + } + + offset = lustre_msg_hdr_size_v2(bufcount); + for (i = 0; i < n; i++) + offset += cfs_size_round(m->lm_buflens[i]); + + return (char *)m + offset; +} + +void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buf_v2(m, n, min_size); + default: + LASSERTF(0, "incorrect message magic: %08x(msg:%p)\n", m->lm_magic, m); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_buf); + +int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, int segment, + unsigned int newlen, int move_data) +{ + char *tail = NULL, *newpos; + int tail_len = 0, n; + + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] >= newlen); + + if (msg->lm_buflens[segment] == newlen) + goto out; + + if (move_data && msg->lm_bufcount > segment + 1) { + tail = lustre_msg_buf_v2(msg, segment + 1, 0); + for (n = segment + 1; n < msg->lm_bufcount; n++) + tail_len += cfs_size_round(msg->lm_buflens[n]); + } + + msg->lm_buflens[segment] = newlen; + + if (tail && tail_len) { + newpos = lustre_msg_buf_v2(msg, segment + 1, 0); + LASSERT(newpos <= tail); + if (newpos != tail) + memmove(newpos, tail, tail_len); + } +out: + return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); +} + +/* + * for @msg, shrink @segment to size @newlen. if @move_data is non-zero, + * we also move data forward from @segment + 1. + * + * if @newlen == 0, we remove the segment completely, but we still keep the + * totally bufcount the same to save possible data moving. this will leave a + * unused segment with size 0 at the tail, but that's ok. + * + * return new msg size after shrinking. + * + * CAUTION: + * + if any buffers higher than @segment has been filled in, must call shrink + * with non-zero @move_data. + * + caller should NOT keep pointers to msg buffers which higher than @segment + * after call shrink. + */ +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_shrink_msg_v2(msg, segment, newlen, move_data); + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_shrink_msg); + +void lustre_free_reply_state(struct ptlrpc_reply_state *rs) +{ + PTLRPC_RS_DEBUG_LRU_DEL(rs); + + LASSERT (atomic_read(&rs->rs_refcount) == 0); + LASSERT (!rs->rs_difficult || rs->rs_handled); + LASSERT (!rs->rs_on_net); + LASSERT (!rs->rs_scheduled); + LASSERT (rs->rs_export == NULL); + LASSERT (rs->rs_nlocks == 0); + LASSERT (list_empty(&rs->rs_exp_list)); + LASSERT (list_empty(&rs->rs_obd_list)); + + sptlrpc_svc_free_rs(rs); +} +EXPORT_SYMBOL(lustre_free_reply_state); + +static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) +{ + int swabbed, required_len, i; + + /* Now we know the sender speaks my language. */ + required_len = lustre_msg_hdr_size_v2(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for lustre_msg\n", len); + return -EINVAL; + } + + swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); + + if (swabbed) { + __swab32s(&m->lm_magic); + __swab32s(&m->lm_bufcount); + __swab32s(&m->lm_secflvr); + __swab32s(&m->lm_repsize); + __swab32s(&m->lm_cksum); + __swab32s(&m->lm_flags); + CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); + CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); + } + + required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR ("message length %d too small for %d buflens\n", + len, m->lm_bufcount); + return -EINVAL; + } + + for (i = 0; i < m->lm_bufcount; i++) { + if (swabbed) + __swab32s(&m->lm_buflens[i]); + required_len += cfs_size_round(m->lm_buflens[i]); + } + + if (len < required_len) { + CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->lm_bufcount); + for (i = 0; i < m->lm_bufcount; i++) + CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); + return -EINVAL; + } + + return swabbed; +} + +int __lustre_unpack_msg(struct lustre_msg *m, int len) +{ + int required_len, rc; + ENTRY; + + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = offsetof(struct lustre_msg, lm_magic) + + sizeof(m->lm_magic); + if (len < required_len) { + /* can't even look inside the message */ + CERROR("message length %d too small for magic/version check\n", + len); + RETURN(-EINVAL); + } + + rc = lustre_unpack_msg_v2(m, len); + + RETURN(rc); +} +EXPORT_SYMBOL(__lustre_unpack_msg); + +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_reqmsg, len); + if (rc == 1) { + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_req_msg); + +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) +{ + int rc; + rc = __lustre_unpack_msg(req->rq_repmsg, len); + if (rc == 1) { + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = 0; + } + return rc; +} +EXPORT_SYMBOL(ptlrpc_unpack_rep_msg); + +static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, + const int inout, int offset) +{ + struct ptlrpc_body *pb; + struct lustre_msg_v2 *m = inout ? req->rq_reqmsg : req->rq_repmsg; + + pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); + if (!pb) { + CERROR("error unpacking ptlrpc body\n"); + return -EFAULT; + } + if (ptlrpc_buf_need_swab(req, inout, offset)) { + lustre_swab_ptlrpc_body(pb); + ptlrpc_buf_set_swabbed(req, inout, offset); + } + + if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %08x\n", pb->pb_version); + return -EINVAL; + } + + return 0; +} + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 1, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EINVAL; + } +} + +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) +{ + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_unpack_ptlrpc_body_v2(req, 0, offset); + default: + CERROR("bad lustre msg magic: %08x\n", + req->rq_repmsg->lm_magic); + return -EINVAL; + } +} + +static inline int lustre_msg_buflen_v2(struct lustre_msg_v2 *m, int n) +{ + if (n >= m->lm_bufcount) + return 0; + + return m->lm_buflens[n]; +} + +/** + * lustre_msg_buflen - return the length of buffer \a n in message \a m + * \param m lustre_msg (request or reply) to look at + * \param n message index (base 0) + * + * returns zero for non-existent message indices + */ +int lustre_msg_buflen(struct lustre_msg *m, int n) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_msg_buflen_v2(m, n); + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_buflen); + +static inline void +lustre_msg_set_buflen_v2(struct lustre_msg_v2 *m, int n, int len) +{ + if (n >= m->lm_bufcount) + LBUG(); + + m->lm_buflens[n] = len; +} + +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + lustre_msg_set_buflen_v2(m, n, len); + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } +} + +EXPORT_SYMBOL(lustre_msg_set_buflen); + +/* NB return the bufcount for lustre_msg_v2 format, so if message is packed + * in V1 format, the result is one bigger. (add struct ptlrpc_body). */ +int lustre_msg_bufcount(struct lustre_msg *m) +{ + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return m->lm_bufcount; + default: + CERROR("incorrect message magic: %08x\n", m->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_bufcount); + +char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str; + int slen, blen; + + switch (m->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + str = lustre_msg_buf_v2(m, index, 0); + blen = lustre_msg_buflen_v2(m, index); + break; + default: + LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + } + + if (str == NULL) { + CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index); + return NULL; + } + + slen = strnlen(str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR("can't unpack non-NULL terminated string in " + "msg %p buffer[%d] len %d\n", m, index, blen); + return NULL; + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR("can't unpack short string in msg %p " + "buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return NULL; + } + } else if (slen > max_len) { + CERROR("can't unpack oversized string in msg %p " + "buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return NULL; + } + + return str; +} +EXPORT_SYMBOL(lustre_msg_string); + +/* Wrap up the normal fixed length cases */ +static inline void *__lustre_swab_buf(struct lustre_msg *msg, int index, + int min_size, void *swabber) +{ + void *ptr = NULL; + + LASSERT(msg != NULL); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + ptr = lustre_msg_buf_v2(msg, index, min_size); + break; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + } + + if (ptr && swabber) + ((void (*)(void *))swabber)(ptr); + + return ptr; +} + +static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) +{ + return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body_v2)); +} + +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + /* already in host endian */ + return msg->lm_flags; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msghdr_get_flags); + +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_flags = flags; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +__u32 lustre_msg_get_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_flags; + } + default: + /* flags might be printed in debug code while message + * uninitialized */ + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_flags); + +void lustre_msg_add_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_flags); + +void lustre_msg_set_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags = flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_flags); + +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_flags &= ~(MSG_GEN_FLAG_MASK & flags); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_clear_flags); + +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_op_flags; + } + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_op_flags); + +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_op_flags); + +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_op_flags |= flags; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_op_flags); + +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return &pb->pb_handle; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_handle); + +__u32 lustre_msg_get_type(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return PTL_RPC_MSG_ERR; + } + return pb->pb_type; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return PTL_RPC_MSG_ERR; + } +} +EXPORT_SYMBOL(lustre_msg_get_type); + +__u32 lustre_msg_get_version(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_version; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_version); + +void lustre_msg_add_version(struct lustre_msg *msg, int version) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_version |= version; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_add_version); + +__u32 lustre_msg_get_opc(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_opc; + } + default: + CERROR("incorrect message magic: %08x(msg:%p)\n", msg->lm_magic, msg); + LBUG(); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_opc); + +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_xid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_xid); + +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_last_committed; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_last_committed); + +__u64 *lustre_msg_get_versions(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return NULL; + } + return pb->pb_pre_versions; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_versions); + +__u64 lustre_msg_get_transno(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_transno; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_transno); + +int lustre_msg_get_status(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_status; + } + default: + /* status might be printed in debug code while message + * uninitialized */ + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_status); + +__u64 lustre_msg_get_slv(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_slv; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_slv); + + +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_slv = slv; + return; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_slv); + +__u32 lustre_msg_get_limit(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return -EINVAL; + } + return pb->pb_limit; + } + default: + CERROR("invalid msg magic %x\n", msg->lm_magic); + return -EINVAL; + } +} +EXPORT_SYMBOL(lustre_msg_get_limit); + + +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return; + } + pb->pb_limit = limit; + return; + } + default: + CERROR("invalid msg magic %08x\n", msg->lm_magic); + return; + } +} +EXPORT_SYMBOL(lustre_msg_set_limit); + +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + } + return pb->pb_conn_cnt; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_conn_cnt); + +int lustre_msg_is_v1(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_is_v1); + +__u32 lustre_msg_get_magic(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_magic; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} +EXPORT_SYMBOL(lustre_msg_get_magic); + +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +char *lustre_msg_get_jobid(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return NULL; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = + lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + if (!pb) + return NULL; + + return pb->pb_jobid; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return NULL; + } +} +EXPORT_SYMBOL(lustre_msg_get_jobid); + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) +/* + * In 1.6 and 1.8 the checksum was computed only on struct ptlrpc_body as + * it was in 1.6 (88 bytes, smaller than the full size in 1.8). It makes + * more sense to compute the checksum on the full ptlrpc_body, regardless + * of what size it is, but in order to keep interoperability with 1.8 we + * can optionally also checksum only the first 88 bytes (caller decides). */ +# define ptlrpc_body_cksum_size_compat18 88 + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18) +#else +# warning "remove checksum compatibility support for b1_8" +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +#endif +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) + __u32 crc; + unsigned int hsize = 4; + __u32 len = compat18 ? ptlrpc_body_cksum_size_compat18 : + lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + len, NULL, 0, (unsigned char *)&crc, + &hsize); + return crc; +#else +# warning "remove checksum compatibility support for b1_8" + __u32 crc; + unsigned int hsize = 4; + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, + lustre_msg_buflen(msg, MSG_PTLRPC_BODY_OFF), + NULL, 0, (unsigned char *)&crc, &hsize); + return crc; +#endif + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_handle = *handle; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_handle); + +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_type = type; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_type); + +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_opc = opc; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_opc); + +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_xid = last_xid; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_xid); + +void lustre_msg_set_last_committed(struct lustre_msg *msg, __u64 last_committed) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_last_committed = last_committed; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_last_committed); + +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_pre_versions[0] = versions[0]; + pb->pb_pre_versions[1] = versions[1]; + pb->pb_pre_versions[2] = versions[2]; + pb->pb_pre_versions[3] = versions[3]; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_versions); + +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_transno = transno; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_transno); + +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_status = status; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_status); + +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_conn_cnt = conn_cnt; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_conn_cnt); + +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + __u32 opc = lustre_msg_get_opc(msg); + struct ptlrpc_body *pb; + + /* Don't set jobid for ldlm ast RPCs, they've been shrinked. + * See the comment in ptlrpc_request_pack(). */ + if (!opc || opc == LDLM_BL_CALLBACK || + opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) + return; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, + sizeof(struct ptlrpc_body)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + + if (jobid != NULL) + memcpy(pb->pb_jobid, jobid, JOBSTATS_JOBID_SIZE); + else if (pb->pb_jobid[0] == '\0') + lustre_get_jobid(pb->pb_jobid); + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} +EXPORT_SYMBOL(lustre_msg_set_jobid); + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + + +void ptlrpc_request_set_replen(struct ptlrpc_request *req) +{ + int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); + + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, + req->rq_pill.rc_area[RCL_SERVER]); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_request_set_replen); + +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *lens) +{ + req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) + req->rq_reqmsg->lm_repsize = req->rq_replen; +} +EXPORT_SYMBOL(ptlrpc_req_set_repsize); + +/** + * Send a remote set_info_async. + * + * This may go from client to server or server to client. + */ +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + obd_count keylen, void *key, + obd_count vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *req; + char *tmp; + int rc; + ENTRY; + + req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); + if (req == NULL) + RETURN(-ENOMEM); + + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, + RCL_CLIENT, keylen); + req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, + RCL_CLIENT, vallen); + rc = ptlrpc_request_pack(req, version, opcode); + if (rc) { + ptlrpc_request_free(req); + RETURN(rc); + } + + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); + memcpy(tmp, key, keylen); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); + memcpy(tmp, val, vallen); + + ptlrpc_request_set_replen(req); + + if (set) { + ptlrpc_set_add_req(set, req); + ptlrpc_check_set(NULL, set); + } else { + rc = ptlrpc_queue_wait(req); + ptlrpc_req_finished(req); + } + + RETURN(rc); +} +EXPORT_SYMBOL(do_set_info_async); + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. + */ +void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) +{ + __swab32s (&b->pb_type); + __swab32s (&b->pb_version); + __swab32s (&b->pb_opc); + __swab32s (&b->pb_status); + __swab64s (&b->pb_last_xid); + __swab64s (&b->pb_last_seen); + __swab64s (&b->pb_last_committed); + __swab64s (&b->pb_transno); + __swab32s (&b->pb_flags); + __swab32s (&b->pb_op_flags); + __swab32s (&b->pb_conn_cnt); + __swab32s (&b->pb_timeout); + __swab32s (&b->pb_service_time); + __swab32s (&b->pb_limit); + __swab64s (&b->pb_slv); + __swab64s (&b->pb_pre_versions[0]); + __swab64s (&b->pb_pre_versions[1]); + __swab64s (&b->pb_pre_versions[2]); + __swab64s (&b->pb_pre_versions[3]); + CLASSERT(offsetof(typeof(*b), pb_padding) != 0); + /* While we need to maintain compatibility between + * clients and servers without ptlrpc_body_v2 (< 2.3) + * do not swab any fields beyond pb_jobid, as we are + * using this swab function for both ptlrpc_body + * and ptlrpc_body_v2. */ + CLASSERT(offsetof(typeof(*b), pb_jobid) != 0); +} +EXPORT_SYMBOL(lustre_swab_ptlrpc_body); + +void lustre_swab_connect(struct obd_connect_data *ocd) +{ + __swab64s(&ocd->ocd_connect_flags); + __swab32s(&ocd->ocd_version); + __swab32s(&ocd->ocd_grant); + __swab64s(&ocd->ocd_ibits_known); + __swab32s(&ocd->ocd_index); + __swab32s(&ocd->ocd_brw_size); + /* ocd_blocksize and ocd_inodespace don't need to be swabbed because + * they are 8-byte values */ + __swab16s(&ocd->ocd_grant_extent); + __swab32s(&ocd->ocd_unused); + __swab64s(&ocd->ocd_transno); + __swab32s(&ocd->ocd_group); + __swab32s(&ocd->ocd_cksum_types); + __swab32s(&ocd->ocd_instance); + /* Fields after ocd_cksum_types are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) + __swab32s(&ocd->ocd_max_easize); + if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) + __swab64s(&ocd->ocd_maxbytes); + CLASSERT(offsetof(typeof(*ocd), padding1) != 0); + CLASSERT(offsetof(typeof(*ocd), padding2) != 0); + CLASSERT(offsetof(typeof(*ocd), padding3) != 0); + CLASSERT(offsetof(typeof(*ocd), padding4) != 0); + CLASSERT(offsetof(typeof(*ocd), padding5) != 0); + CLASSERT(offsetof(typeof(*ocd), padding6) != 0); + CLASSERT(offsetof(typeof(*ocd), padding7) != 0); + CLASSERT(offsetof(typeof(*ocd), padding8) != 0); + CLASSERT(offsetof(typeof(*ocd), padding9) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingA) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingB) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingC) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingD) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingE) != 0); + CLASSERT(offsetof(typeof(*ocd), paddingF) != 0); +} + +void lustre_swab_obdo (struct obdo *o) +{ + __swab64s (&o->o_valid); + lustre_swab_ost_id(&o->o_oi); + __swab64s (&o->o_parent_seq); + __swab64s (&o->o_size); + __swab64s (&o->o_mtime); + __swab64s (&o->o_atime); + __swab64s (&o->o_ctime); + __swab64s (&o->o_blocks); + __swab64s (&o->o_grant); + __swab32s (&o->o_blksize); + __swab32s (&o->o_mode); + __swab32s (&o->o_uid); + __swab32s (&o->o_gid); + __swab32s (&o->o_flags); + __swab32s (&o->o_nlink); + __swab32s (&o->o_parent_oid); + __swab32s (&o->o_misc); + __swab64s (&o->o_ioepoch); + __swab32s (&o->o_stripe_idx); + __swab32s (&o->o_parent_ver); + /* o_handle is opaque */ + /* o_lcookie is swabbed elsewhere */ + __swab32s (&o->o_uid_h); + __swab32s (&o->o_gid_h); + __swab64s (&o->o_data_version); + CLASSERT(offsetof(typeof(*o), o_padding_4) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_5) != 0); + CLASSERT(offsetof(typeof(*o), o_padding_6) != 0); + +} +EXPORT_SYMBOL(lustre_swab_obdo); + +void lustre_swab_obd_statfs (struct obd_statfs *os) +{ + __swab64s (&os->os_type); + __swab64s (&os->os_blocks); + __swab64s (&os->os_bfree); + __swab64s (&os->os_bavail); + __swab64s (&os->os_files); + __swab64s (&os->os_ffree); + /* no need to swab os_fsid */ + __swab32s (&os->os_bsize); + __swab32s (&os->os_namelen); + __swab64s (&os->os_maxbytes); + __swab32s (&os->os_state); + CLASSERT(offsetof(typeof(*os), os_fprecreated) != 0); + CLASSERT(offsetof(typeof(*os), os_spare2) != 0); + CLASSERT(offsetof(typeof(*os), os_spare3) != 0); + CLASSERT(offsetof(typeof(*os), os_spare4) != 0); + CLASSERT(offsetof(typeof(*os), os_spare5) != 0); + CLASSERT(offsetof(typeof(*os), os_spare6) != 0); + CLASSERT(offsetof(typeof(*os), os_spare7) != 0); + CLASSERT(offsetof(typeof(*os), os_spare8) != 0); + CLASSERT(offsetof(typeof(*os), os_spare9) != 0); +} +EXPORT_SYMBOL(lustre_swab_obd_statfs); + +void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) +{ + lustre_swab_ost_id(&ioo->ioo_oid); + __swab32s(&ioo->ioo_max_brw); + __swab32s(&ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(lustre_swab_obd_ioobj); + +void lustre_swab_niobuf_remote (struct niobuf_remote *nbr) +{ + __swab64s (&nbr->offset); + __swab32s (&nbr->len); + __swab32s (&nbr->flags); +} +EXPORT_SYMBOL(lustre_swab_niobuf_remote); + +void lustre_swab_ost_body (struct ost_body *b) +{ + lustre_swab_obdo (&b->oa); +} +EXPORT_SYMBOL(lustre_swab_ost_body); + +void lustre_swab_ost_last_id(obd_id *id) +{ + __swab64s(id); +} +EXPORT_SYMBOL(lustre_swab_ost_last_id); + +void lustre_swab_generic_32s(__u32 *val) +{ + __swab32s(val); +} +EXPORT_SYMBOL(lustre_swab_generic_32s); + +void lustre_swab_gl_desc(union ldlm_gl_desc *desc) +{ + lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid); + __swab64s(&desc->lquota_desc.gl_flags); + __swab64s(&desc->lquota_desc.gl_ver); + __swab64s(&desc->lquota_desc.gl_hardlimit); + __swab64s(&desc->lquota_desc.gl_softlimit); + __swab64s(&desc->lquota_desc.gl_time); + CLASSERT(offsetof(typeof(desc->lquota_desc), gl_pad2) != 0); +} + +void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); + +void lustre_swab_ost_lvb(struct ost_lvb *lvb) +{ + __swab64s(&lvb->lvb_size); + __swab64s(&lvb->lvb_mtime); + __swab64s(&lvb->lvb_atime); + __swab64s(&lvb->lvb_ctime); + __swab64s(&lvb->lvb_blocks); + __swab32s(&lvb->lvb_mtime_ns); + __swab32s(&lvb->lvb_atime_ns); + __swab32s(&lvb->lvb_ctime_ns); + __swab32s(&lvb->lvb_padding); +} +EXPORT_SYMBOL(lustre_swab_ost_lvb); + +void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) +{ + __swab64s(&lvb->lvb_flags); + __swab64s(&lvb->lvb_id_may_rel); + __swab64s(&lvb->lvb_id_rel); + __swab64s(&lvb->lvb_id_qunit); + __swab64s(&lvb->lvb_pad1); +} +EXPORT_SYMBOL(lustre_swab_lquota_lvb); + +void lustre_swab_mdt_body (struct mdt_body *b) +{ + lustre_swab_lu_fid (&b->fid1); + lustre_swab_lu_fid (&b->fid2); + /* handle is opaque */ + __swab64s (&b->valid); + __swab64s (&b->size); + __swab64s (&b->mtime); + __swab64s (&b->atime); + __swab64s (&b->ctime); + __swab64s (&b->blocks); + __swab64s (&b->ioepoch); + CLASSERT(offsetof(typeof(*b), unused1) != 0); + __swab32s (&b->fsuid); + __swab32s (&b->fsgid); + __swab32s (&b->capability); + __swab32s (&b->mode); + __swab32s (&b->uid); + __swab32s (&b->gid); + __swab32s (&b->flags); + __swab32s (&b->rdev); + __swab32s (&b->nlink); + CLASSERT(offsetof(typeof(*b), unused2) != 0); + __swab32s (&b->suppgid); + __swab32s (&b->eadatasize); + __swab32s (&b->aclsize); + __swab32s (&b->max_mdsize); + __swab32s (&b->max_cookiesize); + __swab32s (&b->uid_h); + __swab32s (&b->gid_h); + CLASSERT(offsetof(typeof(*b), padding_5) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_body); + +void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b) +{ + /* handle is opaque */ + __swab64s (&b->ioepoch); + __swab32s (&b->flags); + CLASSERT(offsetof(typeof(*b), padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_mdt_ioepoch); + +void lustre_swab_mgs_target_info(struct mgs_target_info *mti) +{ + int i; + __swab32s(&mti->mti_lustre_ver); + __swab32s(&mti->mti_stripe_index); + __swab32s(&mti->mti_config_ver); + __swab32s(&mti->mti_flags); + __swab32s(&mti->mti_instance); + __swab32s(&mti->mti_nid_count); + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + for (i = 0; i < MTI_NIDS_MAX; i++) + __swab64s(&mti->mti_nids[i]); +} +EXPORT_SYMBOL(lustre_swab_mgs_target_info); + +void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) +{ + int i; + + __swab64s(&entry->mne_version); + __swab32s(&entry->mne_instance); + __swab32s(&entry->mne_index); + __swab32s(&entry->mne_length); + + /* mne_nid_(count|type) must be one byte size because we're gonna + * access it w/o swapping. */ + CLASSERT(sizeof(entry->mne_nid_count) == sizeof(__u8)); + CLASSERT(sizeof(entry->mne_nid_type) == sizeof(__u8)); + + /* remove this assertion if ipv6 is supported. */ + LASSERT(entry->mne_nid_type == 0); + for (i = 0; i < entry->mne_nid_count; i++) { + CLASSERT(sizeof(lnet_nid_t) == sizeof(__u64)); + __swab64s(&entry->u.nids[i]); + } +} +EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); + +void lustre_swab_mgs_config_body(struct mgs_config_body *body) +{ + __swab64s(&body->mcb_offset); + __swab32s(&body->mcb_units); + __swab16s(&body->mcb_type); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_body); + +void lustre_swab_mgs_config_res(struct mgs_config_res *body) +{ + __swab64s(&body->mcr_offset); + __swab64s(&body->mcr_size); +} +EXPORT_SYMBOL(lustre_swab_mgs_config_res); + +static void lustre_swab_obd_dqinfo (struct obd_dqinfo *i) +{ + __swab64s (&i->dqi_bgrace); + __swab64s (&i->dqi_igrace); + __swab32s (&i->dqi_flags); + __swab32s (&i->dqi_valid); +} + +static void lustre_swab_obd_dqblk (struct obd_dqblk *b) +{ + __swab64s (&b->dqb_ihardlimit); + __swab64s (&b->dqb_isoftlimit); + __swab64s (&b->dqb_curinodes); + __swab64s (&b->dqb_bhardlimit); + __swab64s (&b->dqb_bsoftlimit); + __swab64s (&b->dqb_curspace); + __swab64s (&b->dqb_btime); + __swab64s (&b->dqb_itime); + __swab32s (&b->dqb_valid); + CLASSERT(offsetof(typeof(*b), dqb_padding) != 0); +} + +void lustre_swab_obd_quotactl (struct obd_quotactl *q) +{ + __swab32s (&q->qc_cmd); + __swab32s (&q->qc_type); + __swab32s (&q->qc_id); + __swab32s (&q->qc_stat); + lustre_swab_obd_dqinfo (&q->qc_dqinfo); + lustre_swab_obd_dqblk (&q->qc_dqblk); +} +EXPORT_SYMBOL(lustre_swab_obd_quotactl); + +void lustre_swab_mdt_remote_perm (struct mdt_remote_perm *p) +{ + __swab32s (&p->rp_uid); + __swab32s (&p->rp_gid); + __swab32s (&p->rp_fsuid); + __swab32s (&p->rp_fsuid_h); + __swab32s (&p->rp_fsgid); + __swab32s (&p->rp_fsgid_h); + __swab32s (&p->rp_access_perm); + __swab32s (&p->rp_padding); +}; +EXPORT_SYMBOL(lustre_swab_mdt_remote_perm); + +void lustre_swab_fid2path(struct getinfo_fid2path *gf) +{ + lustre_swab_lu_fid(&gf->gf_fid); + __swab64s(&gf->gf_recno); + __swab32s(&gf->gf_linkno); + __swab32s(&gf->gf_pathlen); +} +EXPORT_SYMBOL(lustre_swab_fid2path); + +void lustre_swab_fiemap_extent(struct ll_fiemap_extent *fm_extent) +{ + __swab64s(&fm_extent->fe_logical); + __swab64s(&fm_extent->fe_physical); + __swab64s(&fm_extent->fe_length); + __swab32s(&fm_extent->fe_flags); + __swab32s(&fm_extent->fe_device); +} + +void lustre_swab_fiemap(struct ll_user_fiemap *fiemap) +{ + int i; + + __swab64s(&fiemap->fm_start); + __swab64s(&fiemap->fm_length); + __swab32s(&fiemap->fm_flags); + __swab32s(&fiemap->fm_mapped_extents); + __swab32s(&fiemap->fm_extent_count); + __swab32s(&fiemap->fm_reserved); + + for (i = 0; i < fiemap->fm_mapped_extents; i++) + lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); +} +EXPORT_SYMBOL(lustre_swab_fiemap); + +void lustre_swab_idx_info(struct idx_info *ii) +{ + __swab32s(&ii->ii_magic); + __swab32s(&ii->ii_flags); + __swab16s(&ii->ii_count); + __swab32s(&ii->ii_attrs); + lustre_swab_lu_fid(&ii->ii_fid); + __swab64s(&ii->ii_version); + __swab64s(&ii->ii_hash_start); + __swab64s(&ii->ii_hash_end); + __swab16s(&ii->ii_keysize); + __swab16s(&ii->ii_recsize); +} + +void lustre_swab_lip_header(struct lu_idxpage *lip) +{ + /* swab header */ + __swab32s(&lip->lip_magic); + __swab16s(&lip->lip_flags); + __swab16s(&lip->lip_nr); +} +EXPORT_SYMBOL(lustre_swab_lip_header); + +void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) +{ + __swab32s(&rr->rr_opcode); + __swab32s(&rr->rr_cap); + __swab32s(&rr->rr_fsuid); + /* rr_fsuid_h is unused */ + __swab32s(&rr->rr_fsgid); + /* rr_fsgid_h is unused */ + __swab32s(&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ + __swab32s(&rr->rr_suppgid2); + /* rr_suppgid2_h is unused */ + lustre_swab_lu_fid(&rr->rr_fid1); + lustre_swab_lu_fid(&rr->rr_fid2); + __swab64s(&rr->rr_mtime); + __swab64s(&rr->rr_atime); + __swab64s(&rr->rr_ctime); + __swab64s(&rr->rr_size); + __swab64s(&rr->rr_blocks); + __swab32s(&rr->rr_bias); + __swab32s(&rr->rr_mode); + __swab32s(&rr->rr_flags); + __swab32s(&rr->rr_flags_h); + __swab32s(&rr->rr_umask); + + CLASSERT(offsetof(typeof(*rr), rr_padding_4) != 0); +}; +EXPORT_SYMBOL(lustre_swab_mdt_rec_reint); + +void lustre_swab_lov_desc (struct lov_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_stripe_size); + __swab64s (&ld->ld_default_stripe_offset); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} +EXPORT_SYMBOL(lustre_swab_lov_desc); + +void lustre_swab_lmv_desc (struct lmv_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_hash_size); + __swab32s (&ld->ld_qos_maxage); + /* uuid endian insensitive */ +} + +void lustre_swab_lmv_stripe_md (struct lmv_stripe_md *mea) +{ + __swab32s(&mea->mea_magic); + __swab32s(&mea->mea_count); + __swab32s(&mea->mea_master); + CLASSERT(offsetof(typeof(*mea), mea_padding) != 0); +} + +void lustre_swab_lmv_user_md(struct lmv_user_md *lum) +{ + int i; + + __swab32s(&lum->lum_magic); + __swab32s(&lum->lum_stripe_count); + __swab32s(&lum->lum_stripe_offset); + __swab32s(&lum->lum_hash_type); + __swab32s(&lum->lum_type); + CLASSERT(offsetof(typeof(*lum), lum_padding1) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding2) != 0); + CLASSERT(offsetof(typeof(*lum), lum_padding3) != 0); + + for (i = 0; i < lum->lum_stripe_count; i++) { + __swab32s(&lum->lum_objects[i].lum_mds); + lustre_swab_lu_fid(&lum->lum_objects[i].lum_fid); + } + +} +EXPORT_SYMBOL(lustre_swab_lmv_user_md); + +static void print_lum (struct lov_user_md *lum) +{ + CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); + CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); + CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lum->lmm_pattern); + CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lmm_oi_id(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lmm_oi_seq(&lum->lmm_oi)); + CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); + CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); + CDEBUG(D_OTHER, "\tlmm_stripe_offset/lmm_layout_gen: %#x\n", + lum->lmm_stripe_offset); +} + +static void lustre_swab_lmm_oi(struct ost_id *oi) +{ + __swab64s(&oi->oi.oi_id); + __swab64s(&oi->oi.oi_seq); +} + +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) +{ + ENTRY; + __swab32s(&lum->lmm_magic); + __swab32s(&lum->lmm_pattern); + lustre_swab_lmm_oi(&lum->lmm_oi); + __swab32s(&lum->lmm_stripe_size); + __swab16s(&lum->lmm_stripe_count); + __swab16s(&lum->lmm_stripe_offset); + print_lum(lum); + EXIT; +} + +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); + +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + lustre_swab_lmm_oi(&lmm->lmm_oi); + __swab32s(&lmm->lmm_stripe_size); + __swab16s(&lmm->lmm_stripe_count); + __swab16s(&lmm->lmm_layout_gen); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_mds_md); + +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) +{ + int i; + ENTRY; + for (i = 0; i < stripe_count; i++) { + lustre_swab_ost_id(&(lod[i].l_ost_oi)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); + } + EXIT; +} +EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); + +void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s (&id->name[i]); +} +EXPORT_SYMBOL(lustre_swab_ldlm_res_id); + +void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d) +{ + /* the lock data is a union and the first two fields are always an + * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock + * data the same way. */ + __swab64s(&d->l_extent.start); + __swab64s(&d->l_extent.end); + __swab64s(&d->l_extent.gid); + __swab64s(&d->l_flock.lfw_owner); + __swab32s(&d->l_flock.lfw_pid); +} +EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); + +void lustre_swab_ldlm_intent (struct ldlm_intent *i) +{ + __swab64s (&i->opc); +} +EXPORT_SYMBOL(lustre_swab_ldlm_intent); + +void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r) +{ + __swab32s (&r->lr_type); + CLASSERT(offsetof(typeof(*r), lr_padding) != 0); + lustre_swab_ldlm_res_id (&r->lr_name); +} +EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); + +void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l) +{ + lustre_swab_ldlm_resource_desc (&l->l_resource); + __swab32s (&l->l_req_mode); + __swab32s (&l->l_granted_mode); + lustre_swab_ldlm_policy_data (&l->l_policy_data); +} +EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); + +void lustre_swab_ldlm_request (struct ldlm_request *rq) +{ + __swab32s (&rq->lock_flags); + lustre_swab_ldlm_lock_desc (&rq->lock_desc); + __swab32s (&rq->lock_count); + /* lock_handle[] opaque */ +} +EXPORT_SYMBOL(lustre_swab_ldlm_request); + +void lustre_swab_ldlm_reply (struct ldlm_reply *r) +{ + __swab32s (&r->lock_flags); + CLASSERT(offsetof(typeof(*r), lock_padding) != 0); + lustre_swab_ldlm_lock_desc (&r->lock_desc); + /* lock_handle opaque */ + __swab64s (&r->lock_policy_res1); + __swab64s (&r->lock_policy_res2); +} +EXPORT_SYMBOL(lustre_swab_ldlm_reply); + +void lustre_swab_quota_body(struct quota_body *b) +{ + lustre_swab_lu_fid(&b->qb_fid); + lustre_swab_lu_fid((struct lu_fid *)&b->qb_id); + __swab32s(&b->qb_flags); + __swab64s(&b->qb_count); + __swab64s(&b->qb_usage); + __swab64s(&b->qb_slv_ver); +} + +/* Dump functions */ +void dump_ioo(struct obd_ioobj *ioo) +{ + CDEBUG(D_RPCTRACE, + "obd_ioobj: ioo_oid="DOSTID", ioo_max_brw=%#x, " + "ioo_bufct=%d\n", POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, + ioo->ioo_bufcnt); +} +EXPORT_SYMBOL(dump_ioo); + +void dump_rniobuf(struct niobuf_remote *nb) +{ + CDEBUG(D_RPCTRACE, "niobuf_remote: offset="LPU64", len=%d, flags=%x\n", + nb->offset, nb->len, nb->flags); +} +EXPORT_SYMBOL(dump_rniobuf); + +void dump_obdo(struct obdo *oa) +{ + __u32 valid = oa->o_valid; + + CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid); + if (valid & OBD_MD_FLID) + CDEBUG(D_RPCTRACE, "obdo: id = "DOSTID"\n", POSTID(&oa->o_oi)); + if (valid & OBD_MD_FLFID) + CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = "LPX64"\n", + oa->o_parent_seq); + if (valid & OBD_MD_FLSIZE) + CDEBUG(D_RPCTRACE, "obdo: o_size = "LPD64"\n", oa->o_size); + if (valid & OBD_MD_FLMTIME) + CDEBUG(D_RPCTRACE, "obdo: o_mtime = "LPD64"\n", oa->o_mtime); + if (valid & OBD_MD_FLATIME) + CDEBUG(D_RPCTRACE, "obdo: o_atime = "LPD64"\n", oa->o_atime); + if (valid & OBD_MD_FLCTIME) + CDEBUG(D_RPCTRACE, "obdo: o_ctime = "LPD64"\n", oa->o_ctime); + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + CDEBUG(D_RPCTRACE, "obdo: o_blocks = "LPD64"\n", oa->o_blocks); + if (valid & OBD_MD_FLGRANT) + CDEBUG(D_RPCTRACE, "obdo: o_grant = "LPD64"\n", oa->o_grant); + if (valid & OBD_MD_FLBLKSZ) + CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); + if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) + CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", + oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | + (valid & OBD_MD_FLMODE ? ~S_IFMT : 0))); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); + if (valid & OBD_MD_FLUID) + CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); + if (valid & OBD_MD_FLGID) + CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); + if (valid & OBD_MD_FLFLAGS) + CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); + if (valid & OBD_MD_FLNLINK) + CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); + else if (valid & OBD_MD_FLCKSUM) + CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", + oa->o_nlink); + if (valid & OBD_MD_FLGENER) + CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", + oa->o_parent_oid); + if (valid & OBD_MD_FLEPOCH) + CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = "LPD64"\n", + oa->o_ioepoch); + if (valid & OBD_MD_FLFID) { + CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", + oa->o_stripe_idx); + CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", + oa->o_parent_ver); + } + if (valid & OBD_MD_FLHANDLE) + CDEBUG(D_RPCTRACE, "obdo: o_handle = "LPD64"\n", + oa->o_handle.cookie); + if (valid & OBD_MD_FLCOOKIE) + CDEBUG(D_RPCTRACE, "obdo: o_lcookie = " + "(llog_cookie dumping not yet implemented)\n"); +} +EXPORT_SYMBOL(dump_obdo); + +void dump_ost_body(struct ost_body *ob) +{ + dump_obdo(&ob->oa); +} +EXPORT_SYMBOL(dump_ost_body); + +void dump_rcs(__u32 *rc) +{ + CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); +} +EXPORT_SYMBOL(dump_rcs); + +static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqmsg); + + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + CERROR("bad lustre msg magic: %#08X\n", + req->rq_reqmsg->lm_magic); + } + return 0; +} + +static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repmsg); + + switch (req->rq_repmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); + default: + /* uninitialized yet */ + return 0; + } +} + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ... ) +{ + int req_ok = req->rq_reqmsg != NULL; + int rep_ok = req->rq_repmsg != NULL; + lnet_nid_t nid = LNET_NID_ANY; + va_list args; + + if (ptlrpc_req_need_swab(req)) { + req_ok = req_ok && req_ptlrpc_body_swabbed(req); + rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); + } + + if (req->rq_import && req->rq_import->imp_connection) + nid = req->rq_import->imp_connection->c_peer.nid; + else if (req->rq_export && req->rq_export->exp_connection) + nid = req->rq_export->exp_connection->c_peer.nid; + + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " req@%p x"LPU64"/t"LPD64"("LPD64") o%d->%s@%s:%d/%d" + " lens %d/%d e %d to %d dl "CFS_TIME_T" ref %d " + "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", + req, req->rq_xid, req->rq_transno, + req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, + req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req->rq_import ? + req->rq_import->imp_obd->obd_name : + req->rq_export ? + req->rq_export->exp_client_uuid.uuid : + "", + libcfs_nid2str(nid), + req->rq_request_portal, req->rq_reply_portal, + req->rq_reqlen, req->rq_replen, + req->rq_early_count, req->rq_timedout, + req->rq_deadline, + atomic_read(&req->rq_refcount), + DEBUG_REQ_FLAGS(req), + req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1, + rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1, + req->rq_status, + rep_ok ? lustre_msg_get_status(req->rq_repmsg) : -1); +} +EXPORT_SYMBOL(_debug_req); + +void lustre_swab_lustre_capa(struct lustre_capa *c) +{ + lustre_swab_lu_fid(&c->lc_fid); + __swab64s (&c->lc_opc); + __swab64s (&c->lc_uid); + __swab64s (&c->lc_gid); + __swab32s (&c->lc_flags); + __swab32s (&c->lc_keyid); + __swab32s (&c->lc_timeout); + __swab32s (&c->lc_expiry); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa); + +void lustre_swab_lustre_capa_key(struct lustre_capa_key *k) +{ + __swab64s (&k->lk_seq); + __swab32s (&k->lk_keyid); + CLASSERT(offsetof(typeof(*k), lk_padding) != 0); +} +EXPORT_SYMBOL(lustre_swab_lustre_capa_key); + +void lustre_swab_hsm_user_state(struct hsm_user_state *state) +{ + __swab32s(&state->hus_states); + __swab32s(&state->hus_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_state); + +void lustre_swab_hsm_state_set(struct hsm_state_set *hss) +{ + __swab32s(&hss->hss_valid); + __swab64s(&hss->hss_setmask); + __swab64s(&hss->hss_clearmask); + __swab32s(&hss->hss_archive_id); +} +EXPORT_SYMBOL(lustre_swab_hsm_state_set); + +void lustre_swab_hsm_extent(struct hsm_extent *extent) +{ + __swab64s(&extent->offset); + __swab64s(&extent->length); +} + +void lustre_swab_hsm_current_action(struct hsm_current_action *action) +{ + __swab32s(&action->hca_state); + __swab32s(&action->hca_action); + lustre_swab_hsm_extent(&action->hca_location); +} +EXPORT_SYMBOL(lustre_swab_hsm_current_action); + +void lustre_swab_hsm_user_item(struct hsm_user_item *hui) +{ + lustre_swab_lu_fid(&hui->hui_fid); + lustre_swab_hsm_extent(&hui->hui_extent); +} +EXPORT_SYMBOL(lustre_swab_hsm_user_item); + +void lustre_swab_layout_intent(struct layout_intent *li) +{ + __swab32s(&li->li_opc); + __swab32s(&li->li_flags); + __swab64s(&li->li_start); + __swab64s(&li->li_end); +} +EXPORT_SYMBOL(lustre_swab_layout_intent); + +void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) +{ + lustre_swab_lu_fid(&hpk->hpk_fid); + __swab64s(&hpk->hpk_cookie); + __swab64s(&hpk->hpk_extent.offset); + __swab64s(&hpk->hpk_extent.length); + __swab16s(&hpk->hpk_flags); + __swab16s(&hpk->hpk_errval); +} +EXPORT_SYMBOL(lustre_swab_hsm_progress_kernel); + +void lustre_swab_hsm_request(struct hsm_request *hr) +{ + __swab32s(&hr->hr_action); + __swab32s(&hr->hr_archive_id); + __swab64s(&hr->hr_flags); + __swab32s(&hr->hr_itemcount); + __swab32s(&hr->hr_data_len); +} +EXPORT_SYMBOL(lustre_swab_hsm_request); + +void lustre_swab_update_buf(struct update_buf *ub) +{ + __swab32s(&ub->ub_magic); + __swab32s(&ub->ub_count); +} +EXPORT_SYMBOL(lustre_swab_update_buf); + +void lustre_swab_update_reply_buf(struct update_reply *ur) +{ + int i; + + __swab32s(&ur->ur_version); + __swab32s(&ur->ur_count); + for (i = 0; i < ur->ur_count; i++) + __swab32s(&ur->ur_lens[i]); +} +EXPORT_SYMBOL(lustre_swab_update_reply_buf); + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) +{ + __swab64s(&msl->msl_flags); +} +EXPORT_SYMBOL(lustre_swab_swap_layouts); diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c new file mode 100644 index 000000000000..d926d2b36fb4 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/pers.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdidx) +{ + CLASSERT(PTLRPC_MAX_BRW_PAGES < LI_POISON); + + LASSERT(mdidx < desc->bd_md_max_brw); + LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); + LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | + LNET_MD_PHYS))); + + md->options |= LNET_MD_KIOV; + md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); + md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); + if (desc->bd_enc_iov) + md->start = &desc->bd_enc_iov[mdidx * LNET_MAX_IOV]; + else + md->start = &desc->bd_iov[mdidx * LNET_MAX_IOV]; +} + +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len) +{ + lnet_kiov_t *kiov = &desc->bd_iov[desc->bd_iov_count]; + + kiov->kiov_page = page; + kiov->kiov_offset = pageoffset; + kiov->kiov_len = len; + + desc->bd_iov_count++; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c new file mode 100644 index 000000000000..ef5269aee0de --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c @@ -0,0 +1,763 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/pinger.c + * + * Portal-RPC reconnection and replay operations, for use in recovery. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +#include +#include +#include "ptlrpc_internal.h" + +static int suppress_pings; +CFS_MODULE_PARM(suppress_pings, "i", int, 0644, "Suppress pings"); + +struct mutex pinger_mutex; +static LIST_HEAD(pinger_imports); +static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list); + +int ptlrpc_pinger_suppress_pings() +{ + return suppress_pings; +} +EXPORT_SYMBOL(ptlrpc_pinger_suppress_pings); + +struct ptlrpc_request * +ptlrpc_prep_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + + req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, + LUSTRE_OBD_VERSION, OBD_PING); + if (req) { + ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + } + return req; +} + +int ptlrpc_obd_ping(struct obd_device *obd) +{ + int rc; + struct ptlrpc_request *req; + ENTRY; + + req = ptlrpc_prep_ping(obd->u.cli.cl_import); + if (req == NULL) + RETURN(-ENOMEM); + + req->rq_send_state = LUSTRE_IMP_FULL; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_obd_ping); + +int ptlrpc_ping(struct obd_import *imp) +{ + struct ptlrpc_request *req; + ENTRY; + + req = ptlrpc_prep_ping(imp); + if (req == NULL) { + CERROR("OOM trying to ping %s->%s\n", + imp->imp_obd->obd_uuid.uuid, + obd2cli_tgt(imp->imp_obd)); + RETURN(-ENOMEM); + } + + DEBUG_REQ(D_INFO, req, "pinging %s->%s", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1); + + RETURN(0); +} + +void ptlrpc_update_next_ping(struct obd_import *imp, int soon) +{ + int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; + if (imp->imp_state == LUSTRE_IMP_DISCON) { + int dtime = max_t(int, CONNECTION_SWITCH_MIN, + AT_OFF ? 0 : + at_get(&imp->imp_at.iat_net_latency)); + time = min(time, dtime); + } + imp->imp_next_ping = cfs_time_shift(time); +} + +void ptlrpc_ping_import_soon(struct obd_import *imp) +{ + imp->imp_next_ping = cfs_time_current(); +} + +static inline int imp_is_deactive(struct obd_import *imp) +{ + return (imp->imp_deactive || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); +} + +static inline int ptlrpc_next_reconnect(struct obd_import *imp) +{ + if (imp->imp_server_timeout) + return cfs_time_shift(obd_timeout / 2); + else + return cfs_time_shift(obd_timeout); +} + +static atomic_t suspend_timeouts = ATOMIC_INIT(0); +static cfs_time_t suspend_wakeup_time = 0; + +cfs_duration_t pinger_check_timeout(cfs_time_t time) +{ + struct timeout_item *item; + cfs_time_t timeout = PING_INTERVAL; + + /* The timeout list is a increase order sorted list */ + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + int ti_timeout = item->ti_timeout; + if (timeout > ti_timeout) + timeout = ti_timeout; + break; + } + mutex_unlock(&pinger_mutex); + + return cfs_time_sub(cfs_time_add(time, cfs_time_seconds(timeout)), + cfs_time_current()); +} + +static wait_queue_head_t suspend_timeouts_waitq; + +cfs_time_t ptlrpc_suspend_wakeup_time(void) +{ + return suspend_wakeup_time; +} + +void ptlrpc_deactivate_timeouts(struct obd_import *imp) +{ + /*XXX: disabled for now, will be replaced by adaptive timeouts */ +#if 0 + if (imp->imp_no_timeout) + return; + imp->imp_no_timeout = 1; + atomic_inc(&suspend_timeouts); + CDEBUG(D_HA|D_WARNING, "deactivate timeouts %u\n", + atomic_read(&suspend_timeouts)); +#endif +} + +void ptlrpc_activate_timeouts(struct obd_import *imp) +{ + /*XXX: disabled for now, will be replaced by adaptive timeouts */ +#if 0 + if (!imp->imp_no_timeout) + return; + imp->imp_no_timeout = 0; + LASSERT(atomic_read(&suspend_timeouts) > 0); + if (atomic_dec_and_test(&suspend_timeouts)) { + suspend_wakeup_time = cfs_time_current(); + wake_up(&suspend_timeouts_waitq); + } + CDEBUG(D_HA|D_WARNING, "activate timeouts %u\n", + atomic_read(&suspend_timeouts)); +#endif +} + +int ptlrpc_check_suspend(void) +{ + if (atomic_read(&suspend_timeouts)) + return 1; + return 0; +} + +int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req) +{ + struct l_wait_info lwi; + + if (atomic_read(&suspend_timeouts)) { + DEBUG_REQ(D_NET, req, "-- suspend %d regular timeout", + atomic_read(&suspend_timeouts)); + lwi = LWI_INTR(NULL, NULL); + l_wait_event(suspend_timeouts_waitq, + atomic_read(&suspend_timeouts) == 0, &lwi); + DEBUG_REQ(D_NET, req, "-- recharge regular timeout"); + return 1; + } + return 0; +} + + +static bool ir_up; + +void ptlrpc_pinger_ir_up(void) +{ + CDEBUG(D_HA, "IR up\n"); + ir_up = true; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_up); + +void ptlrpc_pinger_ir_down(void) +{ + CDEBUG(D_HA, "IR down\n"); + ir_up = false; +} +EXPORT_SYMBOL(ptlrpc_pinger_ir_down); + +static void ptlrpc_pinger_process_import(struct obd_import *imp, + unsigned long this_ping) +{ + int level; + int force; + int force_next; + int suppress; + + spin_lock(&imp->imp_lock); + + level = imp->imp_state; + force = imp->imp_force_verify; + force_next = imp->imp_force_next_verify; + /* + * This will be used below only if the import is "FULL". + */ + suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); + + imp->imp_force_verify = 0; + + if (cfs_time_aftereq(imp->imp_next_ping - 5 * CFS_TICK, this_ping) && + !force) { + spin_unlock(&imp->imp_lock); + return; + } + + imp->imp_force_next_verify = 0; + + spin_unlock(&imp->imp_lock); + + CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA, "%s->%s: level %s/%u " + "force %u force_next %u deactive %u pingable %u suppress %u\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level), level, force, force_next, + imp->imp_deactive, imp->imp_pingable, suppress); + + if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { + /* wait for a while before trying recovery again */ + imp->imp_next_ping = ptlrpc_next_reconnect(imp); + if (!imp->imp_no_pinger_recover) + ptlrpc_initiate_recovery(imp); + } else if (level != LUSTRE_IMP_FULL || + imp->imp_obd->obd_no_recov || + imp_is_deactive(imp)) { + CDEBUG(D_HA, "%s->%s: not pinging (in recovery " + "or recovery disabled: %s)\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), + ptlrpc_import_state_name(level)); + } else if ((imp->imp_pingable && !suppress) || force_next || force) { + ptlrpc_ping(imp); + } +} + +static int ptlrpc_pinger_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + ENTRY; + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + /* And now, loop forever, pinging as needed. */ + while (1) { + cfs_time_t this_ping = cfs_time_current(); + struct l_wait_info lwi; + cfs_duration_t time_to_next_wake; + struct timeout_item *item; + struct list_head *iter; + + mutex_lock(&pinger_mutex); + list_for_each_entry(item, &timeout_list, ti_chain) { + item->ti_cb(item, item->ti_cb_data); + } + list_for_each(iter, &pinger_imports) { + struct obd_import *imp = + list_entry(iter, struct obd_import, + imp_pinger_chain); + + ptlrpc_pinger_process_import(imp, this_ping); + /* obd_timeout might have changed */ + if (imp->imp_pingable && imp->imp_next_ping && + cfs_time_after(imp->imp_next_ping, + cfs_time_add(this_ping, + cfs_time_seconds(PING_INTERVAL)))) + ptlrpc_update_next_ping(imp, 0); + } + mutex_unlock(&pinger_mutex); + /* update memory usage info */ + obd_update_maxusage(); + + /* Wait until the next ping time, or until we're stopped. */ + time_to_next_wake = pinger_check_timeout(this_ping); + /* The ping sent by ptlrpc_send_rpc may get sent out + say .01 second after this. + ptlrpc_pinger_sending_on_import will then set the + next ping time to next_ping + .01 sec, which means + we will SKIP the next ping at next_ping, and the + ping will get sent 2 timeouts from now! Beware. */ + CDEBUG(D_INFO, "next wakeup in "CFS_DURATION_T" (" + CFS_TIME_T")\n", time_to_next_wake, + cfs_time_add(this_ping,cfs_time_seconds(PING_INTERVAL))); + if (time_to_next_wake > 0) { + lwi = LWI_TIMEOUT(max_t(cfs_duration_t, + time_to_next_wake, + cfs_time_seconds(1)), + NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_event(thread), + &lwi); + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) { + EXIT; + break; + } else { + /* woken after adding import to reset timer */ + thread_test_and_clear_flags(thread, SVC_EVENT); + } + } + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + + CDEBUG(D_NET, "pinger thread exiting, process %d\n", current_pid()); + return 0; +} + +static struct ptlrpc_thread *pinger_thread = NULL; + +int ptlrpc_start_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + + if (pinger_thread != NULL) + RETURN(-EALREADY); + + OBD_ALLOC_PTR(pinger_thread); + if (pinger_thread == NULL) + RETURN(-ENOMEM); + init_waitqueue_head(&pinger_thread->t_ctl_waitq); + init_waitqueue_head(&suspend_timeouts_waitq); + + strcpy(pinger_thread->t_name, "ll_ping"); + + /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we + * just drop the VM and FILES in cfs_daemonize_ctxt() right away. */ + rc = PTR_ERR(kthread_run(ptlrpc_pinger_main, + pinger_thread, pinger_thread->t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread: %d\n", rc); + OBD_FREE(pinger_thread, sizeof(*pinger_thread)); + pinger_thread = NULL; + RETURN(rc); + } + l_wait_event(pinger_thread->t_ctl_waitq, + thread_is_running(pinger_thread), &lwi); + + if (suppress_pings) + CWARN("Pings will be suppressed at the request of the " + "administrator. The configuration shall meet the " + "additional requirements described in the manual. " + "(Search for the \"suppress_pings\" kernel module " + "parameter.)\n"); + + RETURN(0); +} + +int ptlrpc_pinger_remove_timeouts(void); + +int ptlrpc_stop_pinger(void) +{ + struct l_wait_info lwi = { 0 }; + int rc = 0; + ENTRY; + + if (pinger_thread == NULL) + RETURN(-EALREADY); + + ptlrpc_pinger_remove_timeouts(); + mutex_lock(&pinger_mutex); + thread_set_flags(pinger_thread, SVC_STOPPING); + wake_up(&pinger_thread->t_ctl_waitq); + mutex_unlock(&pinger_mutex); + + l_wait_event(pinger_thread->t_ctl_waitq, + thread_is_stopped(pinger_thread), &lwi); + + OBD_FREE_PTR(pinger_thread); + pinger_thread = NULL; + RETURN(rc); +} + +void ptlrpc_pinger_sending_on_import(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 0); +} +EXPORT_SYMBOL(ptlrpc_pinger_sending_on_import); + +void ptlrpc_pinger_commit_expected(struct obd_import *imp) +{ + ptlrpc_update_next_ping(imp, 1); + LASSERT(spin_is_locked(&imp->imp_lock)); + /* + * Avoid reading stale imp_connect_data. When not sure if pings are + * expected or not on next connection, we assume they are not and force + * one anyway to guarantee the chance of updating + * imp_peer_committed_transno. + */ + if (imp->imp_state != LUSTRE_IMP_FULL || + OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) + imp->imp_force_next_verify = 1; +} + +int ptlrpc_pinger_add_import(struct obd_import *imp) +{ + ENTRY; + if (!list_empty(&imp->imp_pinger_chain)) + RETURN(-EALREADY); + + mutex_lock(&pinger_mutex); + CDEBUG(D_HA, "adding pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we add to pinger we want recovery on this import */ + imp->imp_obd->obd_no_recov = 0; + ptlrpc_update_next_ping(imp, 0); + /* XXX sort, blah blah */ + list_add_tail(&imp->imp_pinger_chain, &pinger_imports); + class_import_get(imp); + + ptlrpc_pinger_wake_up(); + mutex_unlock(&pinger_mutex); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_add_import); + +int ptlrpc_pinger_del_import(struct obd_import *imp) +{ + ENTRY; + if (list_empty(&imp->imp_pinger_chain)) + RETURN(-ENOENT); + + mutex_lock(&pinger_mutex); + list_del_init(&imp->imp_pinger_chain); + CDEBUG(D_HA, "removing pingable import %s->%s\n", + imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); + /* if we remove from pinger we don't want recovery on this import */ + imp->imp_obd->obd_no_recov = 1; + class_import_put(imp); + mutex_unlock(&pinger_mutex); + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_pinger_del_import); + +/** + * Register a timeout callback to the pinger list, and the callback will + * be called when timeout happens. + */ +struct timeout_item* ptlrpc_new_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *ti; + + OBD_ALLOC_PTR(ti); + if (!ti) + return(NULL); + + INIT_LIST_HEAD(&ti->ti_obd_list); + INIT_LIST_HEAD(&ti->ti_chain); + ti->ti_timeout = time; + ti->ti_event = event; + ti->ti_cb = cb; + ti->ti_cb_data = data; + + return ti; +} + +/** + * Register timeout event on the the pinger thread. + * Note: the timeout list is an sorted list with increased timeout value. + */ +static struct timeout_item* +ptlrpc_pinger_register_timeout(int time, enum timeout_event event, + timeout_cb_t cb, void *data) +{ + struct timeout_item *item, *tmp; + + LASSERT(mutex_is_locked(&pinger_mutex)); + + list_for_each_entry(item, &timeout_list, ti_chain) + if (item->ti_event == event) + goto out; + + item = ptlrpc_new_timeout(time, event, cb, data); + if (item) { + list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { + if (tmp->ti_timeout < time) { + list_add(&item->ti_chain, &tmp->ti_chain); + goto out; + } + } + list_add(&item->ti_chain, &timeout_list); + } +out: + return item; +} + +/* Add a client_obd to the timeout event list, when timeout(@time) + * happens, the callback(@cb) will be called. + */ +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list) +{ + struct timeout_item *ti; + + mutex_lock(&pinger_mutex); + ti = ptlrpc_pinger_register_timeout(time, event, cb, data); + if (!ti) { + mutex_unlock(&pinger_mutex); + return (-EINVAL); + } + list_add(obd_list, &ti->ti_obd_list); + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_add_timeout_client); + +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event) +{ + struct timeout_item *ti = NULL, *item; + + if (list_empty(obd_list)) + return 0; + mutex_lock(&pinger_mutex); + list_del_init(obd_list); + /** + * If there are no obd attached to the timeout event + * list, remove this timeout event from the pinger + */ + list_for_each_entry(item, &timeout_list, ti_chain) { + if (item->ti_event == event) { + ti = item; + break; + } + } + LASSERTF(ti != NULL, "ti is NULL ! \n"); + if (list_empty(&ti->ti_obd_list)) { + list_del(&ti->ti_chain); + OBD_FREE_PTR(ti); + } + mutex_unlock(&pinger_mutex); + return 0; +} +EXPORT_SYMBOL(ptlrpc_del_timeout_client); + +int ptlrpc_pinger_remove_timeouts(void) +{ + struct timeout_item *item, *tmp; + + mutex_lock(&pinger_mutex); + list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { + LASSERT(list_empty(&item->ti_obd_list)); + list_del(&item->ti_chain); + OBD_FREE_PTR(item); + } + mutex_unlock(&pinger_mutex); + return 0; +} + +void ptlrpc_pinger_wake_up() +{ + thread_add_flags(pinger_thread, SVC_EVENT); + wake_up(&pinger_thread->t_ctl_waitq); +} + +/* Ping evictor thread */ +#define PET_READY 1 +#define PET_TERMINATE 2 + +static int pet_refcount = 0; +static int pet_state; +static wait_queue_head_t pet_waitq; +LIST_HEAD(pet_list); +static DEFINE_SPINLOCK(pet_lock); + +int ping_evictor_wake(struct obd_export *exp) +{ + struct obd_device *obd; + + spin_lock(&pet_lock); + if (pet_state != PET_READY) { + /* eventually the new obd will call here again. */ + spin_unlock(&pet_lock); + return 1; + } + + obd = class_exp2obd(exp); + if (list_empty(&obd->obd_evict_list)) { + class_incref(obd, "evictor", obd); + list_add(&obd->obd_evict_list, &pet_list); + } + spin_unlock(&pet_lock); + + wake_up(&pet_waitq); + return 0; +} + +static int ping_evictor_main(void *arg) +{ + struct obd_device *obd; + struct obd_export *exp; + struct l_wait_info lwi = { 0 }; + time_t expire_time; + ENTRY; + + unshare_fs_struct(); + + CDEBUG(D_HA, "Starting Ping Evictor\n"); + pet_state = PET_READY; + while (1) { + l_wait_event(pet_waitq, (!list_empty(&pet_list)) || + (pet_state == PET_TERMINATE), &lwi); + + /* loop until all obd's will be removed */ + if ((pet_state == PET_TERMINATE) && list_empty(&pet_list)) + break; + + /* we only get here if pet_exp != NULL, and the end of this + * loop is the only place which sets it NULL again, so lock + * is not strictly necessary. */ + spin_lock(&pet_lock); + obd = list_entry(pet_list.next, struct obd_device, + obd_evict_list); + spin_unlock(&pet_lock); + + expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; + + CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", + obd->obd_name, expire_time); + + /* Exports can't be deleted out of the list while we hold + * the obd lock (class_unlink_export), which means we can't + * lose the last ref on the export. If they've already been + * removed from the list, we won't find them here. */ + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_exports_timed)) { + exp = list_entry(obd->obd_exports_timed.next, + struct obd_export, + exp_obd_chain_timed); + if (expire_time > exp->exp_last_request_time) { + class_export_get(exp); + spin_unlock(&obd->obd_dev_lock); + LCONSOLE_WARN("%s: haven't heard from client %s" + " (at %s) in %ld seconds. I think" + " it's dead, and I am evicting" + " it. exp %p, cur %ld expire %ld" + " last %ld\n", + obd->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + (long)(cfs_time_current_sec() - + exp->exp_last_request_time), + exp, (long)cfs_time_current_sec(), + (long)expire_time, + (long)exp->exp_last_request_time); + CDEBUG(D_HA, "Last request was at %ld\n", + exp->exp_last_request_time); + class_fail_export(exp); + class_export_put(exp); + spin_lock(&obd->obd_dev_lock); + } else { + /* List is sorted, so everyone below is ok */ + break; + } + } + spin_unlock(&obd->obd_dev_lock); + + spin_lock(&pet_lock); + list_del_init(&obd->obd_evict_list); + spin_unlock(&pet_lock); + + class_decref(obd, "evictor", obd); + } + CDEBUG(D_HA, "Exiting Ping Evictor\n"); + + RETURN(0); +} + +void ping_evictor_start(void) +{ + task_t *task; + + if (++pet_refcount > 1) + return; + + init_waitqueue_head(&pet_waitq); + + task = kthread_run(ping_evictor_main, NULL, "ll_evictor"); + if (IS_ERR(task)) { + pet_refcount--; + CERROR("Cannot start ping evictor thread: %ld\n", + PTR_ERR(task)); + } +} +EXPORT_SYMBOL(ping_evictor_start); + +void ping_evictor_stop(void) +{ + if (--pet_refcount > 0) + return; + + pet_state = PET_TERMINATE; + wake_up(&pet_waitq); +} +EXPORT_SYMBOL(ping_evictor_stop); diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h new file mode 100644 index 000000000000..9ba760089b9d --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h @@ -0,0 +1,303 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* Intramodule declarations for ptlrpc. */ + +#ifndef PTLRPC_INTERNAL_H +#define PTLRPC_INTERNAL_H + +#include "../ldlm/ldlm_internal.h" + +struct ldlm_namespace; +struct obd_import; +struct ldlm_res_id; +struct ptlrpc_request_set; +extern int test_req_buffer_pressure; +extern struct mutex ptlrpc_all_services_mutex; + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); +/* ptlrpcd.c */ +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc); + +/* client.c */ +struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +void ptlrpc_init_xid(void); + +/* events.c */ +int ptlrpc_init_portals(void); +void ptlrpc_exit_portals(void); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *); +void lustre_assert_wire_constants(void); +int ptlrpc_import_in_recovery(struct obd_import *imp); +int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); +void ptlrpc_handle_failed_import(struct obd_import *imp); +int ptlrpc_replay_next(struct obd_import *imp, int *inflight); +void ptlrpc_initiate_recovery(struct obd_import *imp); + +int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); +int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); + +#ifdef LPROCFS +void ptlrpc_lprocfs_register_service(struct proc_dir_entry *proc_entry, + struct ptlrpc_service *svc); +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); +void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); +void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, + long q_usec, long work_usec); +#else +#define ptlrpc_lprocfs_register_service(params...) do{}while(0) +#define ptlrpc_lprocfs_unregister_service(params...) do{}while(0) +#define ptlrpc_lprocfs_rpc_sent(params...) do{}while(0) +#define ptlrpc_lprocfs_do_request_stat(params...) do{}while(0) +#endif /* LPROCFS */ + +/* NRS */ + +/** + * NRS core object. + * + * Holds NRS core fields. + */ +struct nrs_core { + /** + * Protects nrs_core::nrs_policies, serializes external policy + * registration/unregistration, and NRS core lprocfs operations. + */ + struct mutex nrs_mutex; + /* XXX: This is just for liblustre. Remove the #if defined directive + * when the * "cfs_" prefix is dropped from cfs_list_head. */ + /** + * List of all policy descriptors registered with NRS core; protected + * by nrs_core::nrs_mutex. + */ + struct list_head nrs_policies; + +}; + +int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); +void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); + +void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); +void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); +void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); +void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req, bool hp); + +struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, + bool peek, bool force); + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, + bool force) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); +} + +static inline struct ptlrpc_request * +ptlrpc_nrs_req_peek_nolock(struct ptlrpc_service_part *svcpt, bool hp) +{ + return ptlrpc_nrs_req_get_nolock0(svcpt, hp, true, false); +} + +void ptlrpc_nrs_req_del_nolock(struct ptlrpc_request *req); +bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); + +int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, + enum ptlrpc_nrs_queue_type queue, char *name, + enum ptlrpc_nrs_ctl opc, bool single, void *arg); + +int ptlrpc_nrs_init(void); +void ptlrpc_nrs_fini(void); + +static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nrs_hp != NULL; +} + +static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) +{ + /** + * If the first service partition has an HP NRS head, all service + * partitions will. + */ + return nrs_svcpt_has_hp(svc->srv_parts[0]); +} + +static inline +struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) +{ + LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); + return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; +} + +static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_cpt; +} + +static inline +struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt->scp_service; +} + +static inline +struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) +{ + return policy->pol_nrs->nrs_svcpt; +} + +static inline +struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) +{ + return nrs_pol2svc(policy)->srv_cptable; +} + +static inline struct ptlrpc_nrs_resource * +nrs_request_resource(struct ptlrpc_nrs_request *nrq) +{ + LASSERT(nrq->nr_initialized); + LASSERT(!nrq->nr_finalized); + + return nrq->nr_res_ptrs[nrq->nr_res_idx]; +} + +static inline +struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) +{ + return nrs_request_resource(nrq)->res_policy; +} + +#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" +#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" + +/** + * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. + */ +#define LPROCFS_NRS_QUANTUM_MAX 65535 + +/** + * Max valid command string is the size of the labels, plus "65535" twice, plus + * a separating space character. + */ +#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ + sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ + NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) + +/* recovd_thread.c */ + +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); + +/* pers.c */ +void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc, + int mdcnt); +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page, + int pageoffset, int len); + +/* pack_generic.c */ +struct ptlrpc_reply_state * +lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); +void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); + +/* pinger.c */ +int ptlrpc_start_pinger(void); +int ptlrpc_stop_pinger(void); +void ptlrpc_pinger_sending_on_import(struct obd_import *imp); +void ptlrpc_pinger_commit_expected(struct obd_import *imp); +void ptlrpc_pinger_wake_up(void); +void ptlrpc_ping_import_soon(struct obd_import *imp); +int ping_evictor_wake(struct obd_export *exp); + +/* sec_null.c */ +int sptlrpc_null_init(void); +void sptlrpc_null_fini(void); + +/* sec_plain.c */ +int sptlrpc_plain_init(void); +void sptlrpc_plain_fini(void); + +/* sec_bulk.c */ +int sptlrpc_enc_pool_init(void); +void sptlrpc_enc_pool_fini(void); +int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, + int *eof, void *data); + +/* sec_lproc.c */ +int sptlrpc_lproc_init(void); +void sptlrpc_lproc_fini(void); + +/* sec_gc.c */ +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); + +/* sec.c */ +int sptlrpc_init(void); +void sptlrpc_fini(void); + +static inline int ll_rpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} + +static inline int tgt_mod_init(void) +{ + return 0; +} + +static inline void tgt_mod_exit(void) +{ + return; +} + +static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) +{ + if (atomic_dec_and_test(&set->set_refcount)) + OBD_FREE_PTR(set); +} +#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c new file mode 100644 index 000000000000..f6ea80f0b105 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c @@ -0,0 +1,154 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + + +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +extern spinlock_t ptlrpc_last_xid_lock; +#if RS_DEBUG +extern spinlock_t ptlrpc_rs_debug_lock; +#endif +extern struct mutex pinger_mutex; +extern struct mutex ptlrpcd_mutex; + +__init int ptlrpc_init(void) +{ + int rc, cleanup_phase = 0; + ENTRY; + + lustre_assert_wire_constants(); +#if RS_DEBUG + spin_lock_init(&ptlrpc_rs_debug_lock); +#endif + mutex_init(&ptlrpc_all_services_mutex); + mutex_init(&pinger_mutex); + mutex_init(&ptlrpcd_mutex); + ptlrpc_init_xid(); + + rc = req_layout_init(); + if (rc) + RETURN(rc); + + rc = ptlrpc_hr_init(); + if (rc) + RETURN(rc); + + cleanup_phase = 1; + + rc = ptlrpc_init_portals(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 2; + + rc = ptlrpc_connection_init(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 3; + + ptlrpc_put_connection_superhack = ptlrpc_connection_put; + + rc = ptlrpc_start_pinger(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 4; + + rc = ldlm_init(); + if (rc) + GOTO(cleanup, rc); + cleanup_phase = 5; + + rc = sptlrpc_init(); + if (rc) + GOTO(cleanup, rc); + + cleanup_phase = 7; + rc = ptlrpc_nrs_init(); + if (rc) + GOTO(cleanup, rc); + + cleanup_phase = 8; + rc = tgt_mod_init(); + if (rc) + GOTO(cleanup, rc); + RETURN(0); + +cleanup: + switch(cleanup_phase) { + case 8: + ptlrpc_nrs_fini(); + case 7: + sptlrpc_fini(); + case 5: + ldlm_exit(); + case 4: + ptlrpc_stop_pinger(); + case 3: + ptlrpc_connection_fini(); + case 2: + ptlrpc_exit_portals(); + case 1: + ptlrpc_hr_fini(); + req_layout_fini(); + default: ; + } + + return rc; +} + +static void __exit ptlrpc_exit(void) +{ + tgt_mod_exit(); + ptlrpc_nrs_fini(); + sptlrpc_fini(); + ldlm_exit(); + ptlrpc_stop_pinger(); + ptlrpc_exit_portals(); + ptlrpc_hr_fini(); + ptlrpc_connection_fini(); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); +MODULE_LICENSE("GPL"); + +cfs_module(ptlrpc, "1.0.0", ptlrpc_init, ptlrpc_exit); diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c new file mode 100644 index 000000000000..185841fe8d00 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c @@ -0,0 +1,827 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/ptlrpcd.c + */ + +/** \defgroup ptlrpcd PortalRPC daemon + * + * ptlrpcd is a special thread with its own set where other user might add + * requests when they don't want to wait for their completion. + * PtlRPCD will take care of sending such requests and then processing their + * replies and calling completion callbacks as necessary. + * The callbacks are called directly from ptlrpcd context. + * It is important to never significantly block (esp. on RPCs!) within such + * completion handler or a deadlock might occur where ptlrpcd enters some + * callback that attempts to send another RPC and wait for it to return, + * during which time ptlrpcd is completely blocked, so e.g. if import + * fails, recovery cannot progress because connection requests are also + * sent by ptlrpcd. + * + * @{ + */ + +#define DEBUG_SUBSYSTEM S_RPC + +# include + +#include +# include + +#include +#include /* for obd_zombie */ +#include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ +#include + +#include "ptlrpc_internal.h" + +struct ptlrpcd { + int pd_size; + int pd_index; + int pd_nthreads; + struct ptlrpcd_ctl pd_thread_rcv; + struct ptlrpcd_ctl pd_threads[0]; +}; + +static int max_ptlrpcds; +CFS_MODULE_PARM(max_ptlrpcds, "i", int, 0644, + "Max ptlrpcd thread count to be started."); + +static int ptlrpcd_bind_policy = PDB_POLICY_PAIR; +CFS_MODULE_PARM(ptlrpcd_bind_policy, "i", int, 0644, + "Ptlrpcd threads binding mode."); +static struct ptlrpcd *ptlrpcds; + +struct mutex ptlrpcd_mutex; +static int ptlrpcd_users = 0; + +void ptlrpcd_wake(struct ptlrpc_request *req) +{ + struct ptlrpc_request_set *rq_set = req->rq_set; + + LASSERT(rq_set != NULL); + + wake_up(&rq_set->set_waitq); +} +EXPORT_SYMBOL(ptlrpcd_wake); + +static struct ptlrpcd_ctl * +ptlrpcd_select_pc(struct ptlrpc_request *req, pdl_policy_t policy, int index) +{ + int idx = 0; + + if (req != NULL && req->rq_send_state != LUSTRE_IMP_FULL) + return &ptlrpcds->pd_thread_rcv; + + switch (policy) { + case PDL_POLICY_SAME: + idx = smp_processor_id() % ptlrpcds->pd_nthreads; + break; + case PDL_POLICY_LOCAL: + /* Before CPU partition patches available, process it the same + * as "PDL_POLICY_ROUND". */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix this code to use new CPU partition APIs" +# endif + /* Fall through to PDL_POLICY_ROUND until the CPU + * CPU partition patches are available. */ + index = -1; + case PDL_POLICY_PREFERRED: + if (index >= 0 && index < num_online_cpus()) { + idx = index % ptlrpcds->pd_nthreads; + break; + } + /* Fall through to PDL_POLICY_ROUND for bad index. */ + default: + /* Fall through to PDL_POLICY_ROUND for unknown policy. */ + case PDL_POLICY_ROUND: + /* We do not care whether it is strict load balance. */ + idx = ptlrpcds->pd_index + 1; + if (idx == smp_processor_id()) + idx++; + idx %= ptlrpcds->pd_nthreads; + ptlrpcds->pd_index = idx; + break; + } + + return &ptlrpcds->pd_threads[idx]; +} + +/** + * Move all request from an existing request set to the ptlrpcd queue. + * All requests from the set must be in phase RQ_PHASE_NEW. + */ +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set) +{ + struct list_head *tmp, *pos; + struct ptlrpcd_ctl *pc; + struct ptlrpc_request_set *new; + int count, i; + + pc = ptlrpcd_select_pc(NULL, PDL_POLICY_LOCAL, -1); + new = pc->pc_set; + + list_for_each_safe(pos, tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(pos, struct ptlrpc_request, + rq_set_chain); + + LASSERT(req->rq_phase == RQ_PHASE_NEW); + req->rq_set = new; + req->rq_queued_time = cfs_time_current(); + } + + spin_lock(&new->set_new_req_lock); + list_splice_init(&set->set_requests, &new->set_new_requests); + i = atomic_read(&set->set_remaining); + count = atomic_add_return(i, &new->set_new_count); + atomic_set(&set->set_remaining, 0); + spin_unlock(&new->set_new_req_lock); + if (count == i) { + wake_up(&new->set_waitq); + + /* XXX: It maybe unnecessary to wakeup all the partners. But to + * guarantee the async RPC can be processed ASAP, we have + * no other better choice. It maybe fixed in future. */ + for (i = 0; i < pc->pc_npartners; i++) + wake_up(&pc->pc_partners[i]->pc_set->set_waitq); + } +} +EXPORT_SYMBOL(ptlrpcd_add_rqset); + +/** + * Return transferred RPCs count. + */ +static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, + struct ptlrpc_request_set *src) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + int rc = 0; + + spin_lock(&src->set_new_req_lock); + if (likely(!list_empty(&src->set_new_requests))) { + list_for_each_safe(pos, tmp, &src->set_new_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + req->rq_set = des; + } + list_splice_init(&src->set_new_requests, + &des->set_requests); + rc = atomic_read(&src->set_new_count); + atomic_add(rc, &des->set_remaining); + atomic_set(&src->set_new_count, 0); + } + spin_unlock(&src->set_new_req_lock); + return rc; +} + +/** + * Requests that are added to the ptlrpcd queue are sent via + * ptlrpcd_check->ptlrpc_check_set(). + */ +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx) +{ + struct ptlrpcd_ctl *pc; + + if (req->rq_reqmsg) + lustre_msg_set_jobid(req->rq_reqmsg, NULL); + + spin_lock(&req->rq_lock); + if (req->rq_invalid_rqset) { + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(5), + back_to_sleep, NULL); + + req->rq_invalid_rqset = 0; + spin_unlock(&req->rq_lock); + l_wait_event(req->rq_set_waitq, (req->rq_set == NULL), &lwi); + } else if (req->rq_set) { + /* If we have a vaid "rq_set", just reuse it to avoid double + * linked. */ + LASSERT(req->rq_phase == RQ_PHASE_NEW); + LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); + + /* ptlrpc_check_set will decrease the count */ + atomic_inc(&req->rq_set->set_remaining); + spin_unlock(&req->rq_lock); + wake_up(&req->rq_set->set_waitq); + return; + } else { + spin_unlock(&req->rq_lock); + } + + pc = ptlrpcd_select_pc(req, policy, idx); + + DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", + req, pc->pc_name, pc->pc_index); + + ptlrpc_set_add_new_req(pc, req); +} +EXPORT_SYMBOL(ptlrpcd_add_req); + +static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) +{ + atomic_inc(&set->set_refcount); +} + +/** + * Check if there is more work to do on ptlrpcd set. + * Returns 1 if yes. + */ +static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + struct ptlrpc_request_set *set = pc->pc_set; + int rc = 0; + int rc2; + ENTRY; + + if (atomic_read(&set->set_new_count)) { + spin_lock(&set->set_new_req_lock); + if (likely(!list_empty(&set->set_new_requests))) { + list_splice_init(&set->set_new_requests, + &set->set_requests); + atomic_add(atomic_read(&set->set_new_count), + &set->set_remaining); + atomic_set(&set->set_new_count, 0); + /* + * Need to calculate its timeout. + */ + rc = 1; + } + spin_unlock(&set->set_new_req_lock); + } + + /* We should call lu_env_refill() before handling new requests to make + * sure that env key the requests depending on really exists. + */ + rc2 = lu_env_refill(env); + if (rc2 != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. + */ + CERROR("Failure to refill session: %d\n", rc2); + RETURN(rc); + } + + if (atomic_read(&set->set_remaining)) + rc |= ptlrpc_check_set(env, set); + + if (!list_empty(&set->set_requests)) { + /* + * XXX: our set never completes, so we prune the completed + * reqs after each iteration. boy could this be smarter. + */ + list_for_each_safe(pos, tmp, &set->set_requests) { + req = list_entry(pos, struct ptlrpc_request, + rq_set_chain); + if (req->rq_phase != RQ_PHASE_COMPLETE) + continue; + + list_del_init(&req->rq_set_chain); + req->rq_set = NULL; + ptlrpc_req_finished(req); + } + } + + if (rc == 0) { + /* + * If new requests have been added, make sure to wake up. + */ + rc = atomic_read(&set->set_new_count); + + /* If we have nothing to do, check whether we can take some + * work from our partner threads. */ + if (rc == 0 && pc->pc_npartners > 0) { + struct ptlrpcd_ctl *partner; + struct ptlrpc_request_set *ps; + int first = pc->pc_cursor; + + do { + partner = pc->pc_partners[pc->pc_cursor++]; + if (pc->pc_cursor >= pc->pc_npartners) + pc->pc_cursor = 0; + if (partner == NULL) + continue; + + spin_lock(&partner->pc_lock); + ps = partner->pc_set; + if (ps == NULL) { + spin_unlock(&partner->pc_lock); + continue; + } + + ptlrpc_reqset_get(ps); + spin_unlock(&partner->pc_lock); + + if (atomic_read(&ps->set_new_count)) { + rc = ptlrpcd_steal_rqset(set, ps); + if (rc > 0) + CDEBUG(D_RPCTRACE, "transfer %d" + " async RPCs [%d->%d]\n", + rc, partner->pc_index, + pc->pc_index); + } + ptlrpc_reqset_put(ps); + } while (rc == 0 && pc->pc_cursor != first); + } + } + + RETURN(rc); +} + +/** + * Main ptlrpcd thread. + * ptlrpc's code paths like to execute in process context, so we have this + * thread which spins on a set which contains the rpcs and sends them. + * + */ +static int ptlrpcd(void *arg) +{ + struct ptlrpcd_ctl *pc = arg; + struct ptlrpc_request_set *set = pc->pc_set; + struct lu_env env = { .le_ses = NULL }; + int rc, exit = 0; + ENTRY; + + unshare_fs_struct(); +#if defined(CONFIG_SMP) + if (test_bit(LIOD_BIND, &pc->pc_flags)) { + int index = pc->pc_index; + + if (index >= 0 && index < num_possible_cpus()) { + while (!cpu_online(index)) { + if (++index >= num_possible_cpus()) + index = 0; + } + cfs_set_cpus_allowed(current, + *cpumask_of_node(cpu_to_node(index))); + } + } +#endif + /* + * XXX So far only "client" ptlrpcd uses an environment. In + * the future, ptlrpcd thread (or a thread-set) has to given + * an argument, describing its "scope". + */ + rc = lu_context_init(&env.le_ctx, + LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF); + complete(&pc->pc_starting); + + if (rc != 0) + RETURN(rc); + + /* + * This mainloop strongly resembles ptlrpc_set_wait() except that our + * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when + * there are requests in the set. New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. + */ + do { + struct l_wait_info lwi; + int timeout; + + timeout = ptlrpc_set_next_timeout(set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, set); + + lu_context_enter(&env.le_ctx); + l_wait_event(set->set_waitq, + ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); + + /* + * Abort inflight rpcs for forced stop case. + */ + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(set); + exit++; + } + + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); + + /* + * Wait for inflight requests to drain. + */ + if (!list_empty(&set->set_requests)) + ptlrpc_set_wait(set); + lu_context_fini(&env.le_ctx); + + complete(&pc->pc_finishing); + + return 0; +} + +/* XXX: We want multiple CPU cores to share the async RPC load. So we start many + * ptlrpcd threads. We also want to reduce the ptlrpcd overhead caused by + * data transfer cross-CPU cores. So we bind ptlrpcd thread to specified + * CPU core. But binding all ptlrpcd threads maybe cause response delay + * because of some CPU core(s) busy with other loads. + * + * For example: "ls -l", some async RPCs for statahead are assigned to + * ptlrpcd_0, and ptlrpcd_0 is bound to CPU_0, but CPU_0 may be quite busy + * with other non-ptlrpcd, like "ls -l" itself (we want to the "ls -l" + * thread, statahead thread, and ptlrpcd thread can run in parallel), under + * such case, the statahead async RPCs can not be processed in time, it is + * unexpected. If ptlrpcd_0 can be re-scheduled on other CPU core, it may + * be better. But it breaks former data transfer policy. + * + * So we shouldn't be blind for avoiding the data transfer. We make some + * compromise: divide the ptlrpcd threds pool into two parts. One part is + * for bound mode, each ptlrpcd thread in this part is bound to some CPU + * core. The other part is for free mode, all the ptlrpcd threads in the + * part can be scheduled on any CPU core. We specify some partnership + * between bound mode ptlrpcd thread(s) and free mode ptlrpcd thread(s), + * and the async RPC load within the partners are shared. + * + * It can partly avoid data transfer cross-CPU (if the bound mode ptlrpcd + * thread can be scheduled in time), and try to guarantee the async RPC + * processed ASAP (as long as the free mode ptlrpcd thread can be scheduled + * on any CPU core). + * + * As for how to specify the partnership between bound mode ptlrpcd + * thread(s) and free mode ptlrpcd thread(s), the simplest way is to use + * pair. In future, we can specify some more complex + * partnership based on the patches for CPU partition. But before such + * patches are available, we prefer to use the simplest one. + */ +# ifdef CFS_CPU_MODE_NUMA +# warning "fix ptlrpcd_bind() to use new CPU partition APIs" +# endif +static int ptlrpcd_bind(int index, int max) +{ + struct ptlrpcd_ctl *pc; + int rc = 0; +#if defined(CONFIG_NUMA) + cpumask_t mask; +#endif + ENTRY; + + LASSERT(index <= max - 1); + pc = &ptlrpcds->pd_threads[index]; + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_NONE: + pc->pc_npartners = -1; + break; + case PDB_POLICY_FULL: + pc->pc_npartners = 0; + set_bit(LIOD_BIND, &pc->pc_flags); + break; + case PDB_POLICY_PAIR: + LASSERT(max % 2 == 0); + pc->pc_npartners = 1; + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + int i; + mask = *cpumask_of_node(cpu_to_node(index)); + for (i = max; i < num_online_cpus(); i++) + cpu_clear(i, mask); + pc->pc_npartners = cpus_weight(mask) - 1; + set_bit(LIOD_BIND, &pc->pc_flags); + } +#else + LASSERT(max >= 3); + pc->pc_npartners = 2; +#endif + break; + default: + CERROR("unknown ptlrpcd bind policy %d\n", ptlrpcd_bind_policy); + rc = -EINVAL; + } + + if (rc == 0 && pc->pc_npartners > 0) { + OBD_ALLOC(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + if (pc->pc_partners == NULL) { + pc->pc_npartners = 0; + rc = -ENOMEM; + } else { + switch (ptlrpcd_bind_policy) { + case PDB_POLICY_PAIR: + if (index & 0x1) { + set_bit(LIOD_BIND, &pc->pc_flags); + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[0] = pc; + } + break; + case PDB_POLICY_NEIGHBOR: +#if defined(CONFIG_NUMA) + { + struct ptlrpcd_ctl *ppc; + int i, pidx; + /* partners are cores in the same NUMA node. + * setup partnership only with ptlrpcd threads + * that are already initialized + */ + for (pidx = 0, i = 0; i < index; i++) { + if (cpu_isset(i, mask)) { + ppc = &ptlrpcds->pd_threads[i]; + pc->pc_partners[pidx++] = ppc; + ppc->pc_partners[ppc-> + pc_npartners++] = pc; + } + } + /* adjust number of partners to the number + * of partnership really setup */ + pc->pc_npartners = pidx; + } +#else + if (index & 0x1) + set_bit(LIOD_BIND, &pc->pc_flags); + if (index > 0) { + pc->pc_partners[0] = &ptlrpcds-> + pd_threads[index - 1]; + ptlrpcds->pd_threads[index - 1]. + pc_partners[1] = pc; + if (index == max - 1) { + pc->pc_partners[1] = + &ptlrpcds->pd_threads[0]; + ptlrpcds->pd_threads[0]. + pc_partners[0] = pc; + } + } +#endif + break; + } + } + } + + RETURN(rc); +} + + +int ptlrpcd_start(int index, int max, const char *name, struct ptlrpcd_ctl *pc) +{ + int rc; + int env = 0; + ENTRY; + + /* + * Do not allow start second thread for one pc. + */ + if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Starting second thread (%s) for same pc %p\n", + name, pc); + RETURN(0); + } + + pc->pc_index = index; + init_completion(&pc->pc_starting); + init_completion(&pc->pc_finishing); + spin_lock_init(&pc->pc_lock); + strncpy(pc->pc_name, name, sizeof(pc->pc_name) - 1); + pc->pc_set = ptlrpc_prep_set(); + if (pc->pc_set == NULL) + GOTO(out, rc = -ENOMEM); + /* + * So far only "client" ptlrpcd uses an environment. In the future, + * ptlrpcd thread (or a thread-set) has to be given an argument, + * describing its "scope". + */ + rc = lu_context_init(&pc->pc_env.le_ctx, LCT_CL_THREAD|LCT_REMEMBER); + if (rc != 0) + GOTO(out, rc); + + env = 1; + { + task_t *task; + if (index >= 0) { + rc = ptlrpcd_bind(index, max); + if (rc < 0) + GOTO(out, rc); + } + + task = kthread_run(ptlrpcd, pc, pc->pc_name); + if (IS_ERR(task)) + GOTO(out, rc = PTR_ERR(task)); + + rc = 0; + wait_for_completion(&pc->pc_starting); + } +out: + if (rc) { + if (pc->pc_set != NULL) { + struct ptlrpc_request_set *set = pc->pc_set; + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + } + if (env != 0) + lu_context_fini(&pc->pc_env.le_ctx); + clear_bit(LIOD_BIND, &pc->pc_flags); + clear_bit(LIOD_START, &pc->pc_flags); + } + RETURN(rc); +} + +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) +{ + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + set_bit(LIOD_STOP, &pc->pc_flags); + if (force) + set_bit(LIOD_FORCE, &pc->pc_flags); + wake_up(&pc->pc_set->set_waitq); + +out: + EXIT; +} + +void ptlrpcd_free(struct ptlrpcd_ctl *pc) +{ + struct ptlrpc_request_set *set = pc->pc_set; + ENTRY; + + if (!test_bit(LIOD_START, &pc->pc_flags)) { + CWARN("Thread for pc %p was not started\n", pc); + goto out; + } + + wait_for_completion(&pc->pc_finishing); + lu_context_fini(&pc->pc_env.le_ctx); + + spin_lock(&pc->pc_lock); + pc->pc_set = NULL; + spin_unlock(&pc->pc_lock); + ptlrpc_set_destroy(set); + + clear_bit(LIOD_START, &pc->pc_flags); + clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); + clear_bit(LIOD_BIND, &pc->pc_flags); + +out: + if (pc->pc_npartners > 0) { + LASSERT(pc->pc_partners != NULL); + + OBD_FREE(pc->pc_partners, + sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners); + pc->pc_partners = NULL; + } + pc->pc_npartners = 0; + EXIT; +} + +static void ptlrpcd_fini(void) +{ + int i; + ENTRY; + + if (ptlrpcds != NULL) { + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_stop(&ptlrpcds->pd_threads[i], 0); + for (i = 0; i < ptlrpcds->pd_nthreads; i++) + ptlrpcd_free(&ptlrpcds->pd_threads[i]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, ptlrpcds->pd_size); + ptlrpcds = NULL; + } + + EXIT; +} + +static int ptlrpcd_init(void) +{ + int nthreads = num_online_cpus(); + char name[16]; + int size, i = -1, j, rc = 0; + ENTRY; + + if (max_ptlrpcds > 0 && max_ptlrpcds < nthreads) + nthreads = max_ptlrpcds; + if (nthreads < 2) + nthreads = 2; + if (nthreads < 3 && ptlrpcd_bind_policy == PDB_POLICY_NEIGHBOR) + ptlrpcd_bind_policy = PDB_POLICY_PAIR; + else if (nthreads % 2 != 0 && ptlrpcd_bind_policy == PDB_POLICY_PAIR) + nthreads &= ~1; /* make sure it is even */ + + size = offsetof(struct ptlrpcd, pd_threads[nthreads]); + OBD_ALLOC(ptlrpcds, size); + if (ptlrpcds == NULL) + GOTO(out, rc = -ENOMEM); + + snprintf(name, 15, "ptlrpcd_rcv"); + set_bit(LIOD_RECOVERY, &ptlrpcds->pd_thread_rcv.pc_flags); + rc = ptlrpcd_start(-1, nthreads, name, &ptlrpcds->pd_thread_rcv); + if (rc < 0) + GOTO(out, rc); + + /* XXX: We start nthreads ptlrpc daemons. Each of them can process any + * non-recovery async RPC to improve overall async RPC efficiency. + * + * But there are some issues with async I/O RPCs and async non-I/O + * RPCs processed in the same set under some cases. The ptlrpcd may + * be blocked by some async I/O RPC(s), then will cause other async + * non-I/O RPC(s) can not be processed in time. + * + * Maybe we should distinguish blocked async RPCs from non-blocked + * async RPCs, and process them in different ptlrpcd sets to avoid + * unnecessary dependency. But how to distribute async RPCs load + * among all the ptlrpc daemons becomes another trouble. */ + for (i = 0; i < nthreads; i++) { + snprintf(name, 15, "ptlrpcd_%d", i); + rc = ptlrpcd_start(i, nthreads, name, &ptlrpcds->pd_threads[i]); + if (rc < 0) + GOTO(out, rc); + } + + ptlrpcds->pd_size = size; + ptlrpcds->pd_index = 0; + ptlrpcds->pd_nthreads = nthreads; + +out: + if (rc != 0 && ptlrpcds != NULL) { + for (j = 0; j <= i; j++) + ptlrpcd_stop(&ptlrpcds->pd_threads[j], 0); + for (j = 0; j <= i; j++) + ptlrpcd_free(&ptlrpcds->pd_threads[j]); + ptlrpcd_stop(&ptlrpcds->pd_thread_rcv, 0); + ptlrpcd_free(&ptlrpcds->pd_thread_rcv); + OBD_FREE(ptlrpcds, size); + ptlrpcds = NULL; + } + + RETURN(0); +} + +int ptlrpcd_addref(void) +{ + int rc = 0; + ENTRY; + + mutex_lock(&ptlrpcd_mutex); + if (++ptlrpcd_users == 1) + rc = ptlrpcd_init(); + mutex_unlock(&ptlrpcd_mutex); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpcd_addref); + +void ptlrpcd_decref(void) +{ + mutex_lock(&ptlrpcd_mutex); + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); + mutex_unlock(&ptlrpcd_mutex); +} +EXPORT_SYMBOL(ptlrpcd_decref); +/** @} ptlrpcd */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c new file mode 100644 index 000000000000..2960889834a2 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c @@ -0,0 +1,357 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/recover.c + * + * Author: Mike Shaver + */ + +#define DEBUG_SUBSYSTEM S_RPC +# include + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for IOC_LOV_SET_OSC_ACTIVE */ +#include + +#include "ptlrpc_internal.h" + +/** + * Start recovery on disconnected import. + * This is done by just attempting a connect + */ +void ptlrpc_initiate_recovery(struct obd_import *imp) +{ + ENTRY; + + CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); + ptlrpc_connect_import(imp); + + EXIT; +} + +/** + * Identify what request from replay list needs to be replayed next + * (based on what we have already replayed) and send it to server. + */ +int ptlrpc_replay_next(struct obd_import *imp, int *inflight) +{ + int rc = 0; + struct list_head *tmp, *pos; + struct ptlrpc_request *req = NULL; + __u64 last_transno; + ENTRY; + + *inflight = 0; + + /* It might have committed some after we last spoke, so make sure we + * get rid of them now. + */ + spin_lock(&imp->imp_lock); + imp->imp_last_transno_checked = 0; + ptlrpc_free_committed(imp); + last_transno = imp->imp_last_replay_transno; + spin_unlock(&imp->imp_lock); + + CDEBUG(D_HA, "import %p from %s committed "LPU64" last "LPU64"\n", + imp, obd2cli_tgt(imp->imp_obd), + imp->imp_peer_committed_transno, last_transno); + + /* Do I need to hold a lock across this iteration? We shouldn't be + * racing with any additions to the list, because we're in recovery + * and are therefore not processing additional requests to add. Calls + * to ptlrpc_free_committed might commit requests, but nothing "newer" + * than the one we're replaying (it can't be committed until it's + * replayed, and we're doing that here). l_f_e_safe protects against + * problems with the current request being committed, in the unlikely + * event of that race. So, in conclusion, I think that it's safe to + * perform this list-walk without the imp_lock held. + * + * But, the {mdc,osc}_replay_open callbacks both iterate + * request lists, and have comments saying they assume the + * imp_lock is being held by ptlrpc_replay, but it's not. it's + * just a little race... + */ + list_for_each_safe(tmp, pos, &imp->imp_replay_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_replay_list); + + /* If need to resend the last sent transno (because a + reconnect has occurred), then stop on the matching + req and send it again. If, however, the last sent + transno has been committed then we continue replay + from the next request. */ + if (req->rq_transno > last_transno) { + if (imp->imp_resend_replay) + lustre_msg_add_flags(req->rq_reqmsg, + MSG_RESENT); + break; + } + req = NULL; + } + + spin_lock(&imp->imp_lock); + imp->imp_resend_replay = 0; + spin_unlock(&imp->imp_lock); + + if (req != NULL) { + rc = ptlrpc_replay_req(req); + if (rc) { + CERROR("recovery replay error %d for req " + LPU64"\n", rc, req->rq_xid); + RETURN(rc); + } + *inflight = 1; + } + RETURN(rc); +} + +/** + * Schedule resending of request on sending_list. This is done after + * we completed replaying of requests and locks. + */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct ptlrpc_request *req, *next; + + ENTRY; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + /* Well... what if lctl recover is called twice at the same time? + */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_RECOVER) { + spin_unlock(&imp->imp_lock); + RETURN(-1); + } + + list_for_each_entry_safe(req, next, &imp->imp_sending_list, + rq_list) { + LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON, + "req %p bad\n", req); + LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); + if (!ptlrpc_no_resend(req)) + ptlrpc_resend_req(req); + } + spin_unlock(&imp->imp_lock); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_resend); + +/** + * Go through all requests in delayed list and wake their threads + * for resending + */ +void ptlrpc_wake_delayed(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + ptlrpc_client_wake_req(req); + } + spin_unlock(&imp->imp_lock); +} +EXPORT_SYMBOL(ptlrpc_wake_delayed); + +void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) +{ + struct obd_import *imp = failed_req->rq_import; + ENTRY; + + CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", + imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid); + + if (ptlrpc_set_import_discon(imp, + lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { + if (!imp->imp_replayable) { + CDEBUG(D_HA, "import %s@%s for %s not replayable, " + "auto-deactivating\n", + obd2cli_tgt(imp->imp_obd), + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_deactivate_import(imp); + } + /* to control recovery via lctl {disable|enable}_recovery */ + if (imp->imp_deactive == 0) + ptlrpc_connect_import(imp); + } + + /* Wait for recovery to complete and resend. If evicted, then + this request will be errored out later.*/ + spin_lock(&failed_req->rq_lock); + if (!failed_req->rq_no_resend) + failed_req->rq_resend = 1; + spin_unlock(&failed_req->rq_lock); + + EXIT; +} + +/** + * Administratively active/deactive a client. + * This should only be called by the ioctl interface, currently + * - the lctl deactivate and activate commands + * - echo 0/1 >> /proc/osc/XXX/active + * - client umount -f (ll_umount_begin) + */ +int ptlrpc_set_import_active(struct obd_import *imp, int active) +{ + struct obd_device *obd = imp->imp_obd; + int rc = 0; + + ENTRY; + LASSERT(obd); + + /* When deactivating, mark import invalid, and abort in-flight + * requests. */ + if (!active) { + LCONSOLE_WARN("setting import %s INACTIVE by administrator " + "request\n", obd2cli_tgt(imp->imp_obd)); + + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ + spin_lock(&imp->imp_lock); + imp->imp_deactive = 1; + spin_unlock(&imp->imp_lock); + + obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); + + ptlrpc_invalidate_import(imp); + } + + /* When activating, mark import valid, and attempt recovery */ + if (active) { + CDEBUG(D_HA, "setting import %s VALID\n", + obd2cli_tgt(imp->imp_obd)); + + spin_lock(&imp->imp_lock); + imp->imp_deactive = 0; + spin_unlock(&imp->imp_lock); + obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); + + rc = ptlrpc_recover_import(imp, NULL, 0); + } + + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_set_import_active); + +/* Attempt to reconnect an import */ +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) +{ + int rc = 0; + ENTRY; + + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || + atomic_read(&imp->imp_inval_count)) + rc = -EINVAL; + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + /* force import to be disconnected. */ + ptlrpc_set_import_discon(imp, 0); + + if (new_uuid) { + struct obd_uuid uuid; + + /* intruct import to use new uuid */ + obd_str2uuid(&uuid, new_uuid); + rc = import_set_conn_priority(imp, &uuid); + if (rc) + GOTO(out, rc); + } + + /* Check if reconnect is already in progress */ + spin_lock(&imp->imp_lock); + if (imp->imp_state != LUSTRE_IMP_DISCON) { + imp->imp_force_verify = 1; + rc = -EALREADY; + } + spin_unlock(&imp->imp_lock); + if (rc) + GOTO(out, rc); + + rc = ptlrpc_connect_import(imp); + if (rc) + GOTO(out, rc); + + if (!async) { + struct l_wait_info lwi; + int secs = cfs_time_seconds(obd_timeout); + + CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", + obd2cli_tgt(imp->imp_obd), secs); + + lwi = LWI_TIMEOUT(secs, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + !ptlrpc_import_in_recovery(imp), &lwi); + CDEBUG(D_HA, "%s: recovery finished\n", + obd2cli_tgt(imp->imp_obd)); + } + EXIT; + +out: + return rc; +} +EXPORT_SYMBOL(ptlrpc_recover_import); + +int ptlrpc_import_in_recovery(struct obd_import *imp) +{ + int in_recovery = 1; + spin_lock(&imp->imp_lock); + if (imp->imp_state == LUSTRE_IMP_FULL || + imp->imp_state == LUSTRE_IMP_CLOSED || + imp->imp_state == LUSTRE_IMP_DISCON) + in_recovery = 0; + spin_unlock(&imp->imp_lock); + return in_recovery; +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c new file mode 100644 index 000000000000..36e8bed5458a --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -0,0 +1,2465 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/*********************************************** + * policy registers * + ***********************************************/ + +static rwlock_t policy_lock; +static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { + NULL, +}; + +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(policy->sp_name); + LASSERT(policy->sp_cops); + LASSERT(policy->sp_sops); + + if (number >= SPTLRPC_POLICY_MAX) + return -EINVAL; + + write_lock(&policy_lock); + if (unlikely(policies[number])) { + write_unlock(&policy_lock); + return -EALREADY; + } + policies[number] = policy; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_register_policy); + +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) +{ + __u16 number = policy->sp_policy; + + LASSERT(number < SPTLRPC_POLICY_MAX); + + write_lock(&policy_lock); + if (unlikely(policies[number] == NULL)) { + write_unlock(&policy_lock); + CERROR("%s: already unregistered\n", policy->sp_name); + return -EINVAL; + } + + LASSERT(policies[number] == policy); + policies[number] = NULL; + write_unlock(&policy_lock); + + CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); + return 0; +} +EXPORT_SYMBOL(sptlrpc_unregister_policy); + +static +struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor) +{ + static DEFINE_MUTEX(load_mutex); + static atomic_t loaded = ATOMIC_INIT(0); + struct ptlrpc_sec_policy *policy; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; + + if (number >= SPTLRPC_POLICY_MAX) + return NULL; + + while (1) { + read_lock(&policy_lock); + policy = policies[number]; + if (policy && !try_module_get(policy->sp_owner)) + policy = NULL; + if (policy == NULL) + flag = atomic_read(&loaded); + read_unlock(&policy_lock); + + if (policy != NULL || flag != 0 || + number != SPTLRPC_POLICY_GSS) + break; + + /* try to load gss module, once */ + mutex_lock(&load_mutex); + if (atomic_read(&loaded) == 0) { + if (request_module("ptlrpc_gss") == 0) + CDEBUG(D_SEC, + "module ptlrpc_gss loaded on demand\n"); + else + CERROR("Unable to load module ptlrpc_gss\n"); + + atomic_set(&loaded, 1); + } + mutex_unlock(&load_mutex); + } + + return policy; +} + +__u32 sptlrpc_name2flavor_base(const char *name) +{ + if (!strcmp(name, "null")) + return SPTLRPC_FLVR_NULL; + if (!strcmp(name, "plain")) + return SPTLRPC_FLVR_PLAIN; + if (!strcmp(name, "krb5n")) + return SPTLRPC_FLVR_KRB5N; + if (!strcmp(name, "krb5a")) + return SPTLRPC_FLVR_KRB5A; + if (!strcmp(name, "krb5i")) + return SPTLRPC_FLVR_KRB5I; + if (!strcmp(name, "krb5p")) + return SPTLRPC_FLVR_KRB5P; + + return SPTLRPC_FLVR_INVALID; +} +EXPORT_SYMBOL(sptlrpc_name2flavor_base); + +const char *sptlrpc_flavor2name_base(__u32 flvr) +{ + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) + return "null"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) + return "plain"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) + return "krb5n"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) + return "krb5a"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) + return "krb5i"; + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) + return "krb5p"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_base); + +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) +{ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); + else + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name); + +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + +/************************************************** + * client context APIs * + **************************************************/ + +static +struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) +{ + struct vfs_cred vcred; + int create = 1, remove_dead = 1; + + LASSERT(sec); + LASSERT(sec->ps_policy->sp_cops->lookup_ctx); + + if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY)) { + vcred.vc_uid = 0; + vcred.vc_gid = 0; + if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { + create = 0; + remove_dead = 0; + } + } else { + vcred.vc_uid = current_uid(); + vcred.vc_gid = current_gid(); + } + + return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, + create, remove_dead); +} + +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) +{ + atomic_inc(&ctx->cc_refcount); + return ctx; +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_get); + +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) +{ + struct ptlrpc_sec *sec = ctx->cc_sec; + + LASSERT(sec); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (!atomic_dec_and_test(&ctx->cc_refcount)) + return; + + sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_put); + +/** + * Expire the client context immediately. + * + * \pre Caller must hold at least 1 reference on the \a ctx. + */ +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(ctx->cc_ops->die); + ctx->cc_ops->die(ctx, 0); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_expire); + +/** + * To wake up the threads who are waiting for this client context. Called + * after some status change happened on \a ctx. + */ +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_request *req, *next; + + spin_lock(&ctx->cc_lock); + list_for_each_entry_safe(req, next, &ctx->cc_req_list, + rq_ctx_chain) { + list_del_init(&req->rq_ctx_chain); + ptlrpc_client_wake_req(req); + } + spin_unlock(&ctx->cc_lock); +} +EXPORT_SYMBOL(sptlrpc_cli_ctx_wakeup); + +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) +{ + LASSERT(ctx->cc_ops); + + if (ctx->cc_ops->display == NULL) + return 0; + + return ctx->cc_ops->display(ctx, buf, bufsize); +} + +static int import_sec_check_expire(struct obd_import *imp) +{ + int adapt = 0; + + spin_lock(&imp->imp_lock); + if (imp->imp_sec_expire && + imp->imp_sec_expire < cfs_time_current_sec()) { + adapt = 1; + imp->imp_sec_expire = 0; + } + spin_unlock(&imp->imp_lock); + + if (!adapt) + return 0; + + CDEBUG(D_SEC, "found delayed sec adapt expired, do it now\n"); + return sptlrpc_import_sec_adapt(imp, NULL, 0); +} + +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) +{ + int rc; + + if (unlikely(imp->imp_sec_expire)) { + rc = import_sec_check_expire(imp); + if (rc) + return rc; + } + + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + CERROR("import %p (%s) with no sec\n", + imp, ptlrpc_import_state_name(imp->imp_state)); + return -EACCES; + } + + if (unlikely((*sec)->ps_dying)) { + CERROR("attempt to use dying sec %p\n", sec); + sptlrpc_sec_put(*sec); + return -EACCES; + } + + return 0; +} + +/** + * Given a \a req, find or allocate a appropriate context for it. + * \pre req->rq_cli_ctx == NULL. + * + * \retval 0 succeed, and req->rq_cli_ctx is set. + * \retval -ev error number, and req->rq_cli_ctx == NULL. + */ +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + + req->rq_cli_ctx = get_my_ctx(sec); + + sptlrpc_sec_put(sec); + + if (!req->rq_cli_ctx) { + CERROR("req %p: fail to get context\n", req); + RETURN(-ENOMEM); + } + + RETURN(0); +} + +/** + * Drop the context for \a req. + * \pre req->rq_cli_ctx != NULL. + * \post req->rq_cli_ctx == NULL. + * + * If \a sync == 0, this function should return quickly without sleep; + * otherwise it might trigger and wait for the whole process of sending + * an context-destroying rpc to server. + */ +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cli_ctx); + + /* request might be asked to release earlier while still + * in the context waiting list. + */ + if (!list_empty(&req->rq_ctx_chain)) { + spin_lock(&req->rq_cli_ctx->cc_lock); + list_del_init(&req->rq_ctx_chain); + spin_unlock(&req->rq_cli_ctx->cc_lock); + } + + sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); + req->rq_cli_ctx = NULL; + EXIT; +} + +static +int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, + struct ptlrpc_cli_ctx *oldctx, + struct ptlrpc_cli_ctx *newctx) +{ + struct sptlrpc_flavor old_flvr; + char *reqmsg = NULL; /* to workaround old gcc */ + int reqmsg_size; + int rc = 0; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + LASSERT(req->rq_replen); + + CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), " + "switch sec %p(%s) -> %p(%s)\n", req, + oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), + newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), + oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, + newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); + + /* save flavor */ + old_flvr = req->rq_flvr; + + /* save request message */ + reqmsg_size = req->rq_reqlen; + if (reqmsg_size != 0) { + OBD_ALLOC_LARGE(reqmsg, reqmsg_size); + if (reqmsg == NULL) + return -ENOMEM; + memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); + } + + /* release old req/rep buf */ + req->rq_cli_ctx = oldctx; + sptlrpc_cli_free_reqbuf(req); + sptlrpc_cli_free_repbuf(req); + req->rq_cli_ctx = newctx; + + /* recalculate the flavor */ + sptlrpc_req_set_flavor(req, 0); + + /* alloc new request buffer + * we don't need to alloc reply buffer here, leave it to the + * rest procedure of ptlrpc */ + if (reqmsg_size != 0) { + rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); + if (!rc) { + LASSERT(req->rq_reqmsg); + memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); + } else { + CWARN("failed to alloc reqbuf: %d\n", rc); + req->rq_flvr = old_flvr; + } + + OBD_FREE_LARGE(reqmsg, reqmsg_size); + } + return rc; +} + +/** + * If current context of \a req is dead somehow, e.g. we just switched flavor + * thus marked original contexts dead, we'll find a new context for it. if + * no switch is needed, \a req will end up with the same context. + * + * \note a request must have a context, to keep other parts of code happy. + * In any case of failure during the switching, we must restore the old one. + */ +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; + struct ptlrpc_cli_ctx *newctx; + int rc; + ENTRY; + + LASSERT(oldctx); + + sptlrpc_cli_ctx_get(oldctx); + sptlrpc_req_put_ctx(req, 0); + + rc = sptlrpc_req_get_ctx(req); + if (unlikely(rc)) { + LASSERT(!req->rq_cli_ctx); + + /* restore old ctx */ + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + newctx = req->rq_cli_ctx; + LASSERT(newctx); + + if (unlikely(newctx == oldctx && + test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { + /* + * still get the old dead ctx, usually means system too busy + */ + CDEBUG(D_SEC, + "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", + newctx, newctx->cc_flags); + + schedule_timeout_and_set_state(TASK_INTERRUPTIBLE, + HZ); + } else { + /* + * it's possible newctx == oldctx if we're switching + * subflavor with the same sec. + */ + rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); + if (rc) { + /* restore old ctx */ + sptlrpc_req_put_ctx(req, 0); + req->rq_cli_ctx = oldctx; + RETURN(rc); + } + + LASSERT(req->rq_cli_ctx == newctx); + } + + sptlrpc_cli_ctx_put(oldctx, 1); + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_req_replace_dead_ctx); + +static +int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) +{ + if (cli_ctx_is_refreshed(ctx)) + return 1; + return 0; +} + +static +int ctx_refresh_timeout(void *data) +{ + struct ptlrpc_request *req = data; + int rc; + + /* conn_cnt is needed in expire_one_request */ + lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); + + rc = ptlrpc_expire_one_request(req, 1); + /* if we started recovery, we should mark this ctx dead; otherwise + * in case of lgssd died nobody would retire this ctx, following + * connecting will still find the same ctx thus cause deadlock. + * there's an assumption that expire time of the request should be + * later than the context refresh expire time. + */ + if (rc == 0) + req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0); + return rc; +} + +static +void ctx_refresh_interrupt(void *data) +{ + struct ptlrpc_request *req = data; + + spin_lock(&req->rq_lock); + req->rq_intr = 1; + spin_unlock(&req->rq_lock); +} + +static +void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) +{ + spin_lock(&ctx->cc_lock); + if (!list_empty(&req->rq_ctx_chain)) + list_del_init(&req->rq_ctx_chain); + spin_unlock(&ctx->cc_lock); +} + +/** + * To refresh the context of \req, if it's not up-to-date. + * \param timeout + * - < 0: don't wait + * - = 0: wait until success or fatal error occur + * - > 0: timeout value (in seconds) + * + * The status of the context could be subject to be changed by other threads + * at any time. We allow this race, but once we return with 0, the caller will + * suppose it's uptodated and keep using it until the owning rpc is done. + * + * \retval 0 only if the context is uptodated. + * \retval -ev error number. + */ +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; + struct l_wait_info lwi; + int rc; + ENTRY; + + LASSERT(ctx); + + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + + /* + * during the process a request's context might change type even + * (e.g. from gss ctx to null ctx), so each loop we need to re-check + * everything + */ +again: + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + RETURN(rc); + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", + req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); + req_off_ctx_list(req, ctx); + sptlrpc_req_replace_dead_ctx(req); + ctx = req->rq_cli_ctx; + } + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) + RETURN(0); + + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { + LASSERT(ctx->cc_ops->refresh); + ctx->cc_ops->refresh(ctx); + } + LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); + + LASSERT(ctx->cc_ops->validate); + if (ctx->cc_ops->validate(ctx) == 0) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + req_off_ctx_list(req, ctx); + RETURN(-EPERM); + } + + /* + * There's a subtle issue for resending RPCs, suppose following + * situation: + * 1. the request was sent to server. + * 2. recovery was kicked start, after finished the request was + * marked as resent. + * 3. resend the request. + * 4. old reply from server received, we accept and verify the reply. + * this has to be success, otherwise the error will be aware + * by application. + * 5. new reply from server received, dropped by LNet. + * + * Note the xid of old & new request is the same. We can't simply + * change xid for the resent request because the server replies on + * it for reply reconstruction. + * + * Commonly the original context should be uptodate because we + * have a expiry nice time; server will keep its context because + * we at least hold a ref of old context which prevent context + * destroying RPC being sent. So server still can accept the request + * and finish the RPC. But if that's not the case: + * 1. If server side context has been trimmed, a NO_CONTEXT will + * be returned, gss_cli_ctx_verify/unseal will switch to new + * context by force. + * 2. Current context never be refreshed, then we are fine: we + * never really send request with old context before. + */ + if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && + unlikely(req->rq_reqmsg) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + req_off_ctx_list(req, ctx); + RETURN(0); + } + + if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + req_off_ctx_list(req, ctx); + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(-EINTR); + } + + rc = sptlrpc_req_replace_dead_ctx(req); + if (rc) { + LASSERT(ctx == req->rq_cli_ctx); + CERROR("req %p: failed to replace dead ctx %p: %d\n", + req, ctx, rc); + spin_lock(&req->rq_lock); + req->rq_err = 1; + spin_unlock(&req->rq_lock); + RETURN(rc); + } + + ctx = req->rq_cli_ctx; + goto again; + } + + /* + * Now we're sure this context is during upcall, add myself into + * waiting list + */ + spin_lock(&ctx->cc_lock); + if (list_empty(&req->rq_ctx_chain)) + list_add(&req->rq_ctx_chain, &ctx->cc_req_list); + spin_unlock(&ctx->cc_lock); + + if (timeout < 0) + RETURN(-EWOULDBLOCK); + + /* Clear any flags that may be present from previous sends */ + LASSERT(req->rq_receiving_reply == 0); + spin_lock(&req->rq_lock); + req->rq_err = 0; + req->rq_timedout = 0; + req->rq_resend = 0; + req->rq_restart = 0; + spin_unlock(&req->rq_lock); + + lwi = LWI_TIMEOUT_INTR(timeout * HZ, ctx_refresh_timeout, + ctx_refresh_interrupt, req); + rc = l_wait_event(req->rq_reply_waitq, ctx_check_refresh(ctx), &lwi); + + /* + * following cases could lead us here: + * - successfully refreshed; + * - interrupted; + * - timedout, and we don't want recover from the failure; + * - timedout, and waked up upon recovery finished; + * - someone else mark this ctx dead by force; + * - someone invalidate the req and call ptlrpc_client_wake_req(), + * e.g. ptlrpc_abort_inflight(); + */ + if (!cli_ctx_is_refreshed(ctx)) { + /* timed out or interruptted */ + req_off_ctx_list(req, ctx); + + LASSERT(rc != 0); + RETURN(rc); + } + + goto again; +} + +/** + * Initialize flavor settings for \a req, according to \a opcode. + * + * \note this could be called in two situations: + * - new request from ptlrpc_pre_req(), with proper @opcode + * - old request which changed ctx in the middle, with @opcode == 0 + */ +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) +{ + struct ptlrpc_sec *sec; + + LASSERT(req->rq_import); + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_cli_ctx->cc_sec); + LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); + + /* special security flags accoding to opcode */ + switch (opcode) { + case OST_READ: + case MDS_READPAGE: + case MGS_CONFIG_READ: + case OBD_IDX_READ: + req->rq_bulk_read = 1; + break; + case OST_WRITE: + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + case SEC_CTX_INIT: + req->rq_ctx_init = 1; + break; + case SEC_CTX_FINI: + req->rq_ctx_fini = 1; + break; + case 0: + /* init/fini rpc won't be resend, so can't be here */ + LASSERT(req->rq_ctx_init == 0); + LASSERT(req->rq_ctx_fini == 0); + + /* cleanup flags, which should be recalculated */ + req->rq_pack_udesc = 0; + req->rq_pack_bulk = 0; + break; + } + + sec = req->rq_cli_ctx->cc_sec; + + spin_lock(&sec->ps_lock); + req->rq_flvr = sec->ps_flvr; + spin_unlock(&sec->ps_lock); + + /* force SVC_NULL for context initiation rpc, SVC_INTG for context + * destruction rpc */ + if (unlikely(req->rq_ctx_init)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + else if (unlikely(req->rq_ctx_fini)) + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + + /* user descriptor flag, null security can't do it anyway */ + if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && + (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) + req->rq_pack_udesc = 1; + + /* bulk security flag */ + if ((req->rq_bulk_read || req->rq_bulk_write) && + sptlrpc_flavor_has_bulk(&req->rq_flvr)) + req->rq_pack_bulk = 1; +} + +void sptlrpc_request_out_callback(struct ptlrpc_request *req) +{ + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + return; + + LASSERT(req->rq_clrbuf); + if (req->rq_pool || !req->rq_reqbuf) + return; + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; +} + +/** + * Given an import \a imp, check whether current user has a valid context + * or not. We may create a new context and try to refresh it, and try + * repeatedly try in case of non-fatal errors. Return 0 means success. + */ +int sptlrpc_import_check_ctx(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + might_sleep(); + + sec = sptlrpc_import_sec_ref(imp); + ctx = get_my_ctx(sec); + sptlrpc_sec_put(sec); + + if (!ctx) + RETURN(-ENOMEM); + + if (cli_ctx_is_eternal(ctx) || + ctx->cc_ops->validate(ctx) == 0) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); + } + + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + + OBD_ALLOC_PTR(req); + if (!req) + RETURN(-ENOMEM); + + spin_lock_init(&req->rq_lock); + atomic_set(&req->rq_refcount, 10000); + INIT_LIST_HEAD(&req->rq_ctx_chain); + init_waitqueue_head(&req->rq_reply_waitq); + init_waitqueue_head(&req->rq_set_waitq); + req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; + req->rq_cli_ctx = ctx; + + rc = sptlrpc_req_refresh_ctx(req, 0); + LASSERT(list_empty(&req->rq_ctx_chain)); + sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); + OBD_FREE_PTR(req); + + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform the pre-defined security transformation + * upon the request message of \a req. After this function called, + * req->rq_reqmsg is still accessible as clear text. + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc = 0; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* we wrap bulk request here because now we can be sure + * the context is uptodate. + */ + if (req->rq_bulk) { + rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); + if (rc) + RETURN(rc); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->sign); + rc = ctx->cc_ops->sign(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->seal); + rc = ctx->cc_ops->seal(ctx, req); + break; + default: + LBUG(); + } + + if (rc == 0) { + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + } + + RETURN(rc); +} + +static int do_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + int rc; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata); + LASSERT(req->rq_repmsg == NULL); + + req->rq_rep_swab_mask = 0; + + rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); + switch (rc) { + case 1: + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("failed unpack reply: x"LPU64"\n", req->rq_xid); + RETURN(-EPROTO); + } + + if (req->rq_repdata_len < sizeof(struct lustre_msg)) { + CERROR("replied data length %d too small\n", + req->rq_repdata_len); + RETURN(-EPROTO); + } + + if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + CERROR("reply policy %u doesn't match request policy %u\n", + SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); + RETURN(-EPROTO); + } + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + case SPTLRPC_SVC_NULL: + case SPTLRPC_SVC_AUTH: + case SPTLRPC_SVC_INTG: + LASSERT(ctx->cc_ops->verify); + rc = ctx->cc_ops->verify(ctx, req); + break; + case SPTLRPC_SVC_PRIV: + LASSERT(ctx->cc_ops->unseal); + rc = ctx->cc_ops->unseal(ctx, req); + break; + default: + LBUG(); + } + LASSERT(rc || req->rq_repmsg || req->rq_resend); + + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + !req->rq_ctx_init) + req->rq_rep_swab_mask = 0; + RETURN(rc); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the reply + * message of \a req. After return successfully, req->rq_repmsg points to + * the reply message in clear text. + * + * \pre the reply buffer should have been un-posted from LNet, so nothing is + * going to change. + */ +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); + + if (req->rq_reply_off == 0 && + (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { + CERROR("real reply with offset 0\n"); + return -EPROTO; + } + + if (req->rq_reply_off % 8 != 0) { + CERROR("reply at odd offset %u\n", req->rq_reply_off); + return -EPROTO; + } + + req->rq_repdata = (struct lustre_msg *) + (req->rq_repbuf + req->rq_reply_off); + req->rq_repdata_len = req->rq_nob_received; + + return do_cli_unwrap_reply(req); +} + +/** + * Used by ptlrpc client, to perform security transformation upon the early + * reply message of \a req. We expect the rq_reply_off is 0, and + * rq_nob_received is the early reply size. + * + * Because the receive buffer might be still posted, the reply data might be + * changed at any time, no matter we're holding rq_lock or not. For this reason + * we allocate a separate ptlrpc_request and reply buffer for early reply + * processing. + * + * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. + * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned + * \a *req_ret to release it. + * \retval -ev error number, and \a req_ret will not be set. + */ +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret) +{ + struct ptlrpc_request *early_req; + char *early_buf; + int early_bufsz, early_size; + int rc; + ENTRY; + + OBD_ALLOC_PTR(early_req); + if (early_req == NULL) + RETURN(-ENOMEM); + + early_size = req->rq_nob_received; + early_bufsz = size_roundup_power2(early_size); + OBD_ALLOC_LARGE(early_buf, early_bufsz); + if (early_buf == NULL) + GOTO(err_req, rc = -ENOMEM); + + /* sanity checkings and copy data out, do it inside spinlock */ + spin_lock(&req->rq_lock); + + if (req->rq_replied) { + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_repdata == NULL); + LASSERT(req->rq_repmsg == NULL); + + if (req->rq_reply_off != 0) { + CERROR("early reply with offset %u\n", req->rq_reply_off); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EPROTO); + } + + if (req->rq_nob_received != early_size) { + /* even another early arrived the size should be the same */ + CERROR("data size has changed from %u to %u\n", + early_size, req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EINVAL); + } + + if (req->rq_nob_received < sizeof(struct lustre_msg)) { + CERROR("early reply length %d too small\n", + req->rq_nob_received); + spin_unlock(&req->rq_lock); + GOTO(err_buf, rc = -EALREADY); + } + + memcpy(early_buf, req->rq_repbuf, early_size); + spin_unlock(&req->rq_lock); + + spin_lock_init(&early_req->rq_lock); + early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); + early_req->rq_flvr = req->rq_flvr; + early_req->rq_repbuf = early_buf; + early_req->rq_repbuf_len = early_bufsz; + early_req->rq_repdata = (struct lustre_msg *) early_buf; + early_req->rq_repdata_len = early_size; + early_req->rq_early = 1; + early_req->rq_reqmsg = req->rq_reqmsg; + + rc = do_cli_unwrap_reply(early_req); + if (rc) { + DEBUG_REQ(D_ADAPTTO, early_req, + "error %d unwrap early reply", rc); + GOTO(err_ctx, rc); + } + + LASSERT(early_req->rq_repmsg); + *req_ret = early_req; + RETURN(0); + +err_ctx: + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); +err_buf: + OBD_FREE_LARGE(early_buf, early_bufsz); +err_req: + OBD_FREE_PTR(early_req); + RETURN(rc); +} + +/** + * Used by ptlrpc client, to release a processed early reply \a early_req. + * + * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). + */ +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) +{ + LASSERT(early_req->rq_repbuf); + LASSERT(early_req->rq_repdata); + LASSERT(early_req->rq_repmsg); + + sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); + OBD_FREE_LARGE(early_req->rq_repbuf, early_req->rq_repbuf_len); + OBD_FREE_PTR(early_req); +} + +/************************************************** + * sec ID * + **************************************************/ + +/* + * "fixed" sec (e.g. null) use sec_id < 0 + */ +static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); + +int sptlrpc_get_next_secid(void) +{ + return atomic_inc_return(&sptlrpc_sec_id); +} +EXPORT_SYMBOL(sptlrpc_get_next_secid); + +/************************************************** + * client side high-level security APIs * + **************************************************/ + +static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, + int grace, int force) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT(policy->sp_cops); + LASSERT(policy->sp_cops->flush_ctx_cache); + + return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); +} + +static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_policy *policy = sec->ps_policy; + + LASSERT_ATOMIC_ZERO(&sec->ps_refcount); + LASSERT_ATOMIC_ZERO(&sec->ps_nctx); + LASSERT(policy->sp_cops->destroy_sec); + + CDEBUG(D_SEC, "%s@%p: being destroied\n", sec->ps_policy->sp_name, sec); + + policy->sp_cops->destroy_sec(sec); + sptlrpc_policy_put(policy); +} + +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec) +{ + sec_cop_destroy_sec(sec); +} +EXPORT_SYMBOL(sptlrpc_sec_destroy); + +static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) +{ + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (sec->ps_policy->sp_cops->kill_sec) { + sec->ps_policy->sp_cops->kill_sec(sec); + + sec_cop_flush_ctx_cache(sec, -1, 1, 1); + } +} + +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) +{ + if (sec) + atomic_inc(&sec->ps_refcount); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_sec_get); + +void sptlrpc_sec_put(struct ptlrpc_sec *sec) +{ + if (sec) { + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + if (atomic_dec_and_test(&sec->ps_refcount)) { + sptlrpc_gc_del_sec(sec); + sec_cop_destroy_sec(sec); + } + } +} +EXPORT_SYMBOL(sptlrpc_sec_put); + +/* + * policy module is responsible for taking refrence of import + */ +static +struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf, + enum lustre_sec_part sp) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_sec *sec; + char str[32]; + ENTRY; + + if (svc_ctx) { + LASSERT(imp->imp_dlm_fake == 1); + + CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_policy_get(svc_ctx->sc_policy); + sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + } else { + LASSERT(imp->imp_dlm_fake == 0); + + CDEBUG(D_SEC, "%s %s: select security flavor %s\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name, + sptlrpc_flavor2name(sf, str, sizeof(str))); + + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); + if (!policy) { + CERROR("invalid flavor 0x%x\n", sf->sf_rpc); + RETURN(NULL); + } + } + + sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); + if (sec) { + atomic_inc(&sec->ps_refcount); + + sec->ps_part = sp; + + if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) + sptlrpc_gc_add_sec(sec); + } else { + sptlrpc_policy_put(policy); + } + + RETURN(sec); +} + +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) +{ + struct ptlrpc_sec *sec; + + spin_lock(&imp->imp_lock); + sec = sptlrpc_sec_get(imp->imp_sec); + spin_unlock(&imp->imp_lock); + + return sec; +} +EXPORT_SYMBOL(sptlrpc_import_sec_ref); + +static void sptlrpc_import_sec_install(struct obd_import *imp, + struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec *old_sec; + + LASSERT_ATOMIC_POS(&sec->ps_refcount); + + spin_lock(&imp->imp_lock); + old_sec = imp->imp_sec; + imp->imp_sec = sec; + spin_unlock(&imp->imp_lock); + + if (old_sec) { + sptlrpc_sec_kill(old_sec); + + /* balance the ref taken by this import */ + sptlrpc_sec_put(old_sec); + } +} + +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + +static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct sptlrpc_flavor *sf) +{ + char str1[32], str2[32]; + + if (sec->ps_flvr.sf_flags != sf->sf_flags) + CDEBUG(D_SEC, "changing sec flags: %s -> %s\n", + sptlrpc_secflags2str(sec->ps_flvr.sf_flags, + str1, sizeof(str1)), + sptlrpc_secflags2str(sf->sf_flags, + str2, sizeof(str2))); + + spin_lock(&sec->ps_lock); + flavor_copy(&sec->ps_flvr, sf); + spin_unlock(&sec->ps_lock); +} + +/** + * To get an appropriate ptlrpc_sec for the \a imp, according to the current + * configuration. Upon called, imp->imp_sec may or may not be NULL. + * + * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; + * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *flvr) +{ + struct ptlrpc_connection *conn; + struct sptlrpc_flavor sf; + struct ptlrpc_sec *sec, *newsec; + enum lustre_sec_part sp; + char str[24]; + int rc = 0; + ENTRY; + + might_sleep(); + + if (imp == NULL) + RETURN(0); + + conn = imp->imp_connection; + + if (svc_ctx == NULL) { + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. + */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; + } else { + /* reverse import, determine flavor from incoming reqeust */ + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; + + sp = sptlrpc_target_sec_part(imp->imp_obd); + } + + sec = sptlrpc_import_sec_ref(imp); + if (sec) { + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + GOTO(out, rc); + + CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + + if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) == + SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) && + SPTLRPC_FLVR_MECH(sf.sf_rpc) == + SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) { + sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); + GOTO(out, rc); + } + } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { + CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid), + LNET_NIDNET(conn->c_self), + sptlrpc_flavor2name(&sf, str, sizeof(str))); + } + + mutex_lock(&imp->imp_sec_mutex); + + newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); + if (newsec) { + sptlrpc_import_sec_install(imp, newsec); + } else { + CERROR("import %s->%s: failed to create new sec\n", + imp->imp_obd->obd_name, + obd_uuid2str(&conn->c_remote_uuid)); + rc = -EPERM; + } + + mutex_unlock(&imp->imp_sec_mutex); +out: + sptlrpc_sec_put(sec); + RETURN(rc); +} + +void sptlrpc_import_sec_put(struct obd_import *imp) +{ + if (imp->imp_sec) { + sptlrpc_sec_kill(imp->imp_sec); + + sptlrpc_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } +} + +static void import_flush_ctx_common(struct obd_import *imp, + uid_t uid, int grace, int force) +{ + struct ptlrpc_sec *sec; + + if (imp == NULL) + return; + + sec = sptlrpc_import_sec_ref(imp); + if (sec == NULL) + return; + + sec_cop_flush_ctx_cache(sec, uid, grace, force); + sptlrpc_sec_put(sec); +} + +void sptlrpc_import_flush_root_ctx(struct obd_import *imp) +{ + /* it's important to use grace mode, see explain in + * sptlrpc_req_refresh_ctx() */ + import_flush_ctx_common(imp, 0, 1, 1); +} + +void sptlrpc_import_flush_my_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, current_uid(), 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); + +void sptlrpc_import_flush_all_ctx(struct obd_import *imp) +{ + import_flush_ctx_common(imp, -1, 1, 1); +} +EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); + +/** + * Used by ptlrpc client to allocate request buffer of \a req. Upon return + * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. + */ +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + int rc; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT(req->rq_reqmsg == NULL); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + policy = ctx->cc_sec->ps_policy; + rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); + if (!rc) { + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqbuf || req->rq_clrbuf); + + /* zeroing preallocated buffer */ + if (req->rq_pool) + memset(req->rq_reqmsg, 0, msgsize); + } + + return rc; +} + +/** + * Used by ptlrpc client to free request buffer of \a req. After this + * req->rq_reqmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_reqbuf == NULL && req->rq_clrbuf == NULL) + return; + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_reqbuf(ctx->cc_sec, req); + req->rq_reqmsg = NULL; +} + +/* + * NOTE caller must guarantee the buffer size is enough for the enlargement + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize) +{ + void *src, *dst; + int oldsize, oldmsg_size, movesize; + + LASSERT(segment < msg->lm_bufcount); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return; + + /* nothing to do if we are enlarging the last segment */ + if (segment == msg->lm_bufcount - 1) { + msg->lm_buflens[segment] = newsize; + return; + } + + oldsize = msg->lm_buflens[segment]; + + src = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = newsize; + dst = lustre_msg_buf(msg, segment + 1, 0); + msg->lm_buflens[segment] = oldsize; + + /* move from segment + 1 to end segment */ + LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); + oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + movesize = oldmsg_size - ((unsigned long) src - (unsigned long) msg); + LASSERT(movesize >= 0); + + if (movesize) + memmove(dst, src, movesize); + + /* note we don't clear the ares where old data live, not secret */ + + /* finally set new segment size */ + msg->lm_buflens[segment] = newsize; +} +EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); + +/** + * Used by ptlrpc client to enlarge the \a segment of request message pointed + * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be + * preserved after the enlargement. this must be called after original request + * buffer being allocated. + * + * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, + * so caller should refresh its local pointers if needed. + */ +int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_cops *cops; + struct lustre_msg *msg = req->rq_reqmsg; + + LASSERT(ctx); + LASSERT(msg); + LASSERT(msg->lm_bufcount > segment); + LASSERT(msg->lm_buflens[segment] <= newsize); + + if (msg->lm_buflens[segment] == newsize) + return 0; + + cops = ctx->cc_sec->ps_policy->sp_cops; + LASSERT(cops->enlarge_reqbuf); + return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); +} +EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); + +/** + * Used by ptlrpc client to allocate reply buffer of \a req. + * + * \note After this, req->rq_repmsg is still not accessible. + */ +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + + if (req->rq_repbuf) + RETURN(0); + + policy = ctx->cc_sec->ps_policy; + RETURN(policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize)); +} + +/** + * Used by ptlrpc client to free reply buffer of \a req. After this + * req->rq_repmsg is set to NULL and should not be accessed anymore. + */ +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec_policy *policy; + ENTRY; + + LASSERT(ctx); + LASSERT(ctx->cc_sec); + LASSERT(ctx->cc_sec->ps_policy); + LASSERT_ATOMIC_POS(&ctx->cc_refcount); + + if (req->rq_repbuf == NULL) + return; + LASSERT(req->rq_repbuf_len); + + policy = ctx->cc_sec->ps_policy; + policy->sp_cops->free_repbuf(ctx->cc_sec, req); + req->rq_repmsg = NULL; + EXIT; +} + +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->cc_sec->ps_policy; + + if (!policy->sp_cops->install_rctx) + return 0; + return policy->sp_cops->install_rctx(imp, ctx->cc_sec, ctx); +} + +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx) +{ + struct ptlrpc_sec_policy *policy = ctx->sc_policy; + + if (!policy->sp_sops->install_rctx) + return 0; + return policy->sp_sops->install_rctx(imp, ctx); +} + +/**************************************** + * server side security * + ****************************************/ + +static int flavor_allowed(struct sptlrpc_flavor *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor *flvr = &req->rq_flvr; + + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) + return 1; + + if ((req->rq_ctx_init || req->rq_ctx_fini) && + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) + return 1; + + return 0; +} + +#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) + +/** + * Given an export \a exp, check whether the flavor of incoming \a req + * is allowed by the export \a exp. Main logic is about taking care of + * changing configurations. Return 0 means success. + */ +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flavor; + + if (exp == NULL) + return 0; + + /* client side export has no imp_reverse, skip + * FIXME maybe we should check flavor this as well??? */ + if (exp->exp_imp_reverse == NULL) + return 0; + + /* don't care about ctx fini rpc */ + if (req->rq_ctx_fini) + return 0; + + spin_lock(&exp->exp_lock); + + /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for + * the first req with the new flavor, then treat it as current flavor, + * adapt reverse sec according to it. + * note the first rpc with new flavor might not be with root ctx, in + * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. */ + if (unlikely(exp->exp_flvr_changed) && + flavor_allowed(&exp->exp_flvr_old[1], req)) { + /* make the new flavor as "current", and old ones as + * about-to-expire */ + CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, + exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr_old[1]; + exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; + exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; + exp->exp_flvr_old[0] = exp->exp_flvr; + exp->exp_flvr_expire[0] = cfs_time_current_sec() + + EXP_FLVR_UPDATE_EXPIRE; + exp->exp_flvr = flavor; + + /* flavor change finished */ + exp->exp_flvr_changed = 0; + LASSERT(exp->exp_flvr_adapt == 1); + + /* if it's gss, we only interested in root ctx init */ + if (req->rq_auth_gss && + !(req->rq_ctx_init && + (req->rq_auth_usr_root || req->rq_auth_usr_mdt || + req->rq_auth_usr_ost))) { + spin_unlock(&exp->exp_lock); + CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", + req->rq_auth_gss, req->rq_ctx_init, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, + req->rq_auth_usr_ost); + return 0; + } + + exp->exp_flvr_adapt = 0; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, &flavor); + } + + /* if it equals to the current flavor, we accept it, but need to + * dealing with reverse sec/ctx */ + if (likely(flavor_allowed(&exp->exp_flvr, req))) { + /* most cases should return here, we only interested in + * gss root ctx init */ + if (!req->rq_auth_gss || !req->rq_ctx_init || + (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost)) { + spin_unlock(&exp->exp_lock); + return 0; + } + + /* if flavor just changed, we should not proceed, just leave + * it and current flavor will be discovered and replaced + * shortly, and let _this_ rpc pass through */ + if (exp->exp_flvr_changed) { + LASSERT(exp->exp_flvr_adapt); + spin_unlock(&exp->exp_lock); + return 0; + } + + if (exp->exp_flvr_adapt) { + exp->exp_flvr_adapt = 0; + CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + flavor = exp->exp_flvr; + spin_unlock(&exp->exp_lock); + + return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, + req->rq_svc_ctx, + &flavor); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, " + "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + spin_unlock(&exp->exp_lock); + + return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, + req->rq_svc_ctx); + } + } + + if (exp->exp_flvr_expire[0]) { + if (exp->exp_flvr_expire[0] >= cfs_time_current_sec()) { + if (flavor_allowed(&exp->exp_flvr_old[0], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the " + "middle one ("CFS_DURATION_T")\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[0] - + cfs_time_current_sec()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark middle expired\n"); + exp->exp_flvr_expire[0] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } + + /* now it doesn't match the current flavor, the only chance we can + * accept it is match the old flavors which is not expired. */ + if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { + if (exp->exp_flvr_expire[1] >= cfs_time_current_sec()) { + if (flavor_allowed(&exp->exp_flvr_old[1], req)) { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the " + "oldest one ("CFS_DURATION_T")\n", exp, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] - + cfs_time_current_sec()); + spin_unlock(&exp->exp_lock); + return 0; + } + } else { + CDEBUG(D_SEC, "mark oldest expired\n"); + exp->exp_flvr_expire[1] = 0; + } + CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", + exp, exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, + req->rq_flvr.sf_rpc); + } else { + CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", + exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + + spin_unlock(&exp->exp_lock); + + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with " + "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n", + exp, exp->exp_obd->obd_name, + req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, + req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (unsigned long) (exp->exp_flvr_expire[0] - + cfs_time_current_sec()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (unsigned long) (exp->exp_flvr_expire[1] - + cfs_time_current_sec()) : 0); + return -EACCES; +} +EXPORT_SYMBOL(sptlrpc_target_export_check); + +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset) +{ + struct obd_export *exp; + struct sptlrpc_flavor new_flvr; + + LASSERT(obd); + + spin_lock(&obd->obd_dev_lock); + + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) { + if (exp->exp_connection == NULL) + continue; + + /* note if this export had just been updated flavor + * (exp_flvr_changed == 1), this will override the + * previous one. */ + spin_lock(&exp->exp_lock); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); + if (exp->exp_flvr_changed || + !flavor_equal(&new_flvr, &exp->exp_flvr)) { + exp->exp_flvr_old[1] = new_flvr; + exp->exp_flvr_expire[1] = 0; + exp->exp_flvr_changed = 1; + exp->exp_flvr_adapt = 1; + + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", + exp, sptlrpc_part2name(exp->exp_sp_peer), + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[1].sf_rpc); + } + spin_unlock(&exp->exp_lock); + } + + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(sptlrpc_target_update_exp_flavor); + +static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) +{ + /* peer's claim is unreliable unless gss is being used */ + if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) + return svc_rc; + + switch (req->rq_sp_from) { + case LUSTRE_SP_CLI: + if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source CLI"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MDT: + if (!req->rq_auth_usr_mdt) { + DEBUG_REQ(D_ERROR, req, "faked source MDT"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_OST: + if (!req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source OST"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_MGS: + case LUSTRE_SP_MGC: + if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && + !req->rq_auth_usr_ost) { + DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); + svc_rc = SECSVC_DROP; + } + break; + case LUSTRE_SP_ANY: + default: + DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); + svc_rc = SECSVC_DROP; + } + + return svc_rc; +} + +/** + * Used by ptlrpc server, to perform transformation upon request message of + * incoming \a req. This must be the first thing to do with a incoming + * request in ptlrpc layer. + * + * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in + * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. + * \retval SECSVC_COMPLETE success, the request has been fully processed, and + * reply message has been prepared. + * \retval SECSVC_DROP failed, this request should be dropped. + */ +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; + ENTRY; + + LASSERT(msg); + LASSERT(req->rq_reqmsg == NULL); + LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); + + req->rq_req_swab_mask = 0; + + rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); + switch (rc) { + case 1: + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + case 0: + break; + default: + CERROR("error unpacking request from %s x"LPU64"\n", + libcfs_id2str(req->rq_peer), req->rq_xid); + RETURN(SECSVC_DROP); + } + + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + req->rq_sp_from = LUSTRE_SP_ANY; + req->rq_auth_uid = INVALID_UID; + req->rq_auth_mapped_uid = INVALID_UID; + + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); + if (!policy) { + CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + LASSERT(policy->sp_sops->accept); + rc = policy->sp_sops->accept(req); + sptlrpc_policy_put(policy); + LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); + + /* + * if it's not null flavor (which means embedded packing msg), + * reset the swab mask for the comming inner msg unpacking. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) + req->rq_req_swab_mask = 0; + + /* sanity check for the request source */ + rc = sptlrpc_svc_check_from(req, rc); + RETURN(rc); +} + +/** + * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, + * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to + * a buffer of \a msglen size. + */ +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) +{ + struct ptlrpc_sec_policy *policy; + struct ptlrpc_reply_state *rs; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->alloc_rs); + + rc = policy->sp_sops->alloc_rs(req, msglen); + if (unlikely(rc == -ENOMEM)) { + /* failed alloc, try emergency pool */ + rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_svcpt); + if (rs == NULL) + RETURN(-ENOMEM); + + req->rq_reply_state = rs; + rc = policy->sp_sops->alloc_rs(req, msglen); + if (rc) { + lustre_put_emerg_rs(rs); + req->rq_reply_state = NULL; + } + } + + LASSERT(rc != 0 || + (req->rq_reply_state && req->rq_reply_state->rs_msg)); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to perform transformation upon reply message. + * + * \post req->rq_reply_off is set to approriate server-controlled reply offset. + * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. + */ +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_sec_policy *policy; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_svc_ctx->sc_policy); + + policy = req->rq_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->authorize); + + rc = policy->sp_sops->authorize(req); + LASSERT(rc || req->rq_reply_state->rs_repdata_len); + + RETURN(rc); +} + +/** + * Used by ptlrpc server, to free reply_state. + */ +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_sec_policy *policy; + unsigned int prealloc; + ENTRY; + + LASSERT(rs->rs_svc_ctx); + LASSERT(rs->rs_svc_ctx->sc_policy); + + policy = rs->rs_svc_ctx->sc_policy; + LASSERT(policy->sp_sops->free_rs); + + prealloc = rs->rs_prealloc; + policy->sp_sops->free_rs(rs); + + if (prealloc) + lustre_put_emerg_rs(rs); + EXIT; +} + +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx != NULL) + atomic_inc(&ctx->sc_refcount); +} + +void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (atomic_dec_and_test(&ctx->sc_refcount)) { + if (ctx->sc_policy->sp_sops->free_ctx) + ctx->sc_policy->sp_sops->free_ctx(ctx); + } + req->rq_svc_ctx = NULL; +} + +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req) +{ + struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; + + if (ctx == NULL) + return; + + LASSERT_ATOMIC_POS(&ctx->sc_refcount); + if (ctx->sc_policy->sp_sops->invalidate_ctx) + ctx->sc_policy->sp_sops->invalidate_ctx(ctx); +} +EXPORT_SYMBOL(sptlrpc_svc_ctx_invalidate); + +/**************************************** + * bulk security * + ****************************************/ + +/** + * Perform transformation upon bulk data pointed by \a desc. This is called + * before transforming the request message. + */ +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->wrap_bulk) + return ctx->cc_ops->wrap_bulk(ctx, req, desc); + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); + +/** + * This is called after unwrap the reply message. + * return nob of actual plain text size received, or error code. + */ +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_read && !req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); + +/** + * This is called after unwrap the reply message. + * return 0 for success or error code. + */ +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_cli_ctx; + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. + */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); + + +/**************************************** + * user descriptor helpers * + ****************************************/ + +int sptlrpc_current_user_desc_size(void) +{ + int ngroups; + + ngroups = current_ngroups; + + if (ngroups > LUSTRE_MAX_GROUPS) + ngroups = LUSTRE_MAX_GROUPS; + return sptlrpc_user_desc_size(ngroups); +} +EXPORT_SYMBOL(sptlrpc_current_user_desc_size); + +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) +{ + struct ptlrpc_user_desc *pud; + + pud = lustre_msg_buf(msg, offset, 0); + + pud->pud_uid = current_uid(); + pud->pud_gid = current_gid(); + pud->pud_fsuid = current_fsuid(); + pud->pud_fsgid = current_fsgid(); + pud->pud_cap = cfs_curproc_cap_pack(); + pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; + + task_lock(current); + if (pud->pud_ngroups > current_ngroups) + pud->pud_ngroups = current_ngroups; + memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + pud->pud_ngroups * sizeof(__u32)); + task_unlock(current); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_pack_user_desc); + +int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_user_desc *pud; + int i; + + pud = lustre_msg_buf(msg, offset, sizeof(*pud)); + if (!pud) + return -EINVAL; + + if (swabbed) { + __swab32s(&pud->pud_uid); + __swab32s(&pud->pud_gid); + __swab32s(&pud->pud_fsuid); + __swab32s(&pud->pud_fsgid); + __swab32s(&pud->pud_cap); + __swab32s(&pud->pud_ngroups); + } + + if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is too large\n", pud->pud_ngroups); + return -EINVAL; + } + + if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > + msg->lm_buflens[offset]) { + CERROR("%u groups are claimed but bufsize only %u\n", + pud->pud_ngroups, msg->lm_buflens[offset]); + return -EINVAL; + } + + if (swabbed) { + for (i = 0; i < pud->pud_ngroups; i++) + __swab32s(&pud->pud_groups[i]); + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_unpack_user_desc); + +/**************************************** + * misc helpers * + ****************************************/ + +const char * sec2target_str(struct ptlrpc_sec *sec) +{ + if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) + return "*"; + if (sec_is_reverse(sec)) + return "c"; + return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); +} +EXPORT_SYMBOL(sec2target_str); + +/* + * return true if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + +/**************************************** + * crypto API helper/alloc blkciper * + ****************************************/ + +/**************************************** + * initialize/finalize * + ****************************************/ + +int sptlrpc_init(void) +{ + int rc; + + rwlock_init(&policy_lock); + + rc = sptlrpc_gc_init(); + if (rc) + goto out; + + rc = sptlrpc_conf_init(); + if (rc) + goto out_gc; + + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + + rc = sptlrpc_null_init(); + if (rc) + goto out_pool; + + rc = sptlrpc_plain_init(); + if (rc) + goto out_null; + + rc = sptlrpc_lproc_init(); + if (rc) + goto out_plain; + + return 0; + +out_plain: + sptlrpc_plain_fini(); +out_null: + sptlrpc_null_fini(); +out_pool: + sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); +out_gc: + sptlrpc_gc_fini(); +out: + return rc; +} + +void sptlrpc_fini(void) +{ + sptlrpc_lproc_fini(); + sptlrpc_plain_fini(); + sptlrpc_null_fini(); + sptlrpc_enc_pool_fini(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c new file mode 100644 index 000000000000..60ab2eaf6683 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c @@ -0,0 +1,881 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_bulk.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +/**************************************** + * bulk encryption page pools * + ****************************************/ + + +#define PTRS_PER_PAGE (PAGE_CACHE_SIZE / sizeof(void *)) +#define PAGES_PER_POOL (PTRS_PER_PAGE) + +#define IDLE_IDX_MAX (100) +#define IDLE_IDX_WEIGHT (3) + +#define CACHE_QUIESCENT_PERIOD (20) + +static struct ptlrpc_enc_page_pool { + /* + * constants + */ + unsigned long epp_max_pages; /* maximum pages can hold, const */ + unsigned int epp_max_pools; /* number of pools, const */ + + /* + * wait queue in case of not enough free pages. + */ + wait_queue_head_t epp_waitq; /* waiting threads */ + unsigned int epp_waitqlen; /* wait queue length */ + unsigned long epp_pages_short; /* # of pages wanted of in-q users */ + unsigned int epp_growing:1; /* during adding pages */ + + /* + * indicating how idle the pools are, from 0 to MAX_IDLE_IDX + * this is counted based on each time when getting pages from + * the pools, not based on time. which means in case that system + * is idled for a while but the idle_idx might still be low if no + * activities happened in the pools. + */ + unsigned long epp_idle_idx; + + /* last shrink time due to mem tight */ + long epp_last_shrink; + long epp_last_access; + + /* + * in-pool pages bookkeeping + */ + spinlock_t epp_lock; /* protect following fields */ + unsigned long epp_total_pages; /* total pages in pools */ + unsigned long epp_free_pages; /* current pages available */ + + /* + * statistics + */ + unsigned long epp_st_max_pages; /* # of pages ever reached */ + unsigned int epp_st_grows; /* # of grows */ + unsigned int epp_st_grow_fails; /* # of add pages failures */ + unsigned int epp_st_shrinks; /* # of shrinks */ + unsigned long epp_st_access; /* # of access */ + unsigned long epp_st_missings; /* # of cache missing */ + unsigned long epp_st_lowfree; /* lowest free pages reached */ + unsigned int epp_st_max_wqlen; /* highest waitqueue length */ + cfs_time_t epp_st_max_wait; /* in jeffies */ + /* + * pointers to pools + */ + struct page ***epp_pools; +} page_pools; + +/* + * memory shrinker + */ +const int pools_shrinker_seeks = DEFAULT_SEEKS; +static struct shrinker *pools_shrinker = NULL; + + +/* + * /proc/fs/lustre/sptlrpc/encrypt_page_pools + */ +int sptlrpc_proc_read_enc_pool(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int rc; + + spin_lock(&page_pools.epp_lock); + + rc = snprintf(page, count, + "physical pages: %lu\n" + "pages per pool: %lu\n" + "max pages: %lu\n" + "max pools: %u\n" + "total pages: %lu\n" + "total free: %lu\n" + "idle index: %lu/100\n" + "last shrink: %lds\n" + "last access: %lds\n" + "max pages reached: %lu\n" + "grows: %u\n" + "grows failure: %u\n" + "shrinks: %u\n" + "cache access: %lu\n" + "cache missing: %lu\n" + "low free mark: %lu\n" + "max waitqueue depth: %u\n" + "max wait time: "CFS_TIME_T"/%u\n" + , + num_physpages, + PAGES_PER_POOL, + page_pools.epp_max_pages, + page_pools.epp_max_pools, + page_pools.epp_total_pages, + page_pools.epp_free_pages, + page_pools.epp_idle_idx, + cfs_time_current_sec() - page_pools.epp_last_shrink, + cfs_time_current_sec() - page_pools.epp_last_access, + page_pools.epp_st_max_pages, + page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, + page_pools.epp_st_access, + page_pools.epp_st_missings, + page_pools.epp_st_lowfree, + page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ + ); + + spin_unlock(&page_pools.epp_lock); + return rc; +} + +static void enc_pools_release_free_pages(long npages) +{ + int p_idx, g_idx; + int p_idx_max1, p_idx_max2; + + LASSERT(npages > 0); + LASSERT(npages <= page_pools.epp_free_pages); + LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); + + /* max pool index before the release */ + p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; + + page_pools.epp_free_pages -= npages; + page_pools.epp_total_pages -= npages; + + /* max pool index after the release */ + p_idx_max1 = page_pools.epp_total_pages == 0 ? -1 : + ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + LASSERT(page_pools.epp_pools[p_idx]); + + while (npages--) { + LASSERT(page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + + __free_page(page_pools.epp_pools[p_idx][g_idx]); + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + }; + + /* free unused pools */ + while (p_idx_max1 < p_idx_max2) { + LASSERT(page_pools.epp_pools[p_idx_max2]); + OBD_FREE(page_pools.epp_pools[p_idx_max2], PAGE_CACHE_SIZE); + page_pools.epp_pools[p_idx_max2] = NULL; + p_idx_max2--; + } +} + +/* + * could be called frequently for query (@nr_to_scan == 0). + * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. + */ +static int enc_pools_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + if (unlikely(shrink_param(sc, nr_to_scan) != 0)) { + spin_lock(&page_pools.epp_lock); + shrink_param(sc, nr_to_scan) = min_t(unsigned long, + shrink_param(sc, nr_to_scan), + page_pools.epp_free_pages - + PTLRPC_MAX_BRW_PAGES); + if (shrink_param(sc, nr_to_scan) > 0) { + enc_pools_release_free_pages(shrink_param(sc, + nr_to_scan)); + CDEBUG(D_SEC, "released %ld pages, %ld left\n", + (long)shrink_param(sc, nr_to_scan), + page_pools.epp_free_pages); + + page_pools.epp_st_shrinks++; + page_pools.epp_last_shrink = cfs_time_current_sec(); + } + spin_unlock(&page_pools.epp_lock); + } + + /* + * if no pool access for a long time, we consider it's fully idle. + * a little race here is fine. + */ + if (unlikely(cfs_time_current_sec() - page_pools.epp_last_access > + CACHE_QUIESCENT_PERIOD)) { + spin_lock(&page_pools.epp_lock); + page_pools.epp_idle_idx = IDLE_IDX_MAX; + spin_unlock(&page_pools.epp_lock); + } + + LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); + return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) * + (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; +} + +static inline +int npages_to_npools(unsigned long npages) +{ + return (int) ((npages + PAGES_PER_POOL - 1) / PAGES_PER_POOL); +} + +/* + * return how many pages cleaned up. + */ +static unsigned long enc_pools_cleanup(struct page ***pools, int npools) +{ + unsigned long cleaned = 0; + int i, j; + + for (i = 0; i < npools; i++) { + if (pools[i]) { + for (j = 0; j < PAGES_PER_POOL; j++) { + if (pools[i][j]) { + __free_page(pools[i][j]); + cleaned++; + } + } + OBD_FREE(pools[i], PAGE_CACHE_SIZE); + pools[i] = NULL; + } + } + + return cleaned; +} + +/* + * merge @npools pointed by @pools which contains @npages new pages + * into current pools. + * + * we have options to avoid most memory copy with some tricks. but we choose + * the simplest way to avoid complexity. It's not frequently called. + */ +static void enc_pools_insert(struct page ***pools, int npools, int npages) +{ + int freeslot; + int op_idx, np_idx, og_idx, ng_idx; + int cur_npools, end_npools; + + LASSERT(npages > 0); + LASSERT(page_pools.epp_total_pages+npages <= page_pools.epp_max_pages); + LASSERT(npages_to_npools(npages) == npools); + LASSERT(page_pools.epp_growing); + + spin_lock(&page_pools.epp_lock); + + /* + * (1) fill all the free slots of current pools. + */ + /* free slots are those left by rent pages, and the extra ones with + * index >= total_pages, locate at the tail of last pool. */ + freeslot = page_pools.epp_total_pages % PAGES_PER_POOL; + if (freeslot != 0) + freeslot = PAGES_PER_POOL - freeslot; + freeslot += page_pools.epp_total_pages - page_pools.epp_free_pages; + + op_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + og_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + np_idx = npools - 1; + ng_idx = (npages - 1) % PAGES_PER_POOL; + + while (freeslot) { + LASSERT(page_pools.epp_pools[op_idx][og_idx] == NULL); + LASSERT(pools[np_idx][ng_idx] != NULL); + + page_pools.epp_pools[op_idx][og_idx] = pools[np_idx][ng_idx]; + pools[np_idx][ng_idx] = NULL; + + freeslot--; + + if (++og_idx == PAGES_PER_POOL) { + op_idx++; + og_idx = 0; + } + if (--ng_idx < 0) { + if (np_idx == 0) + break; + np_idx--; + ng_idx = PAGES_PER_POOL - 1; + } + } + + /* + * (2) add pools if needed. + */ + cur_npools = (page_pools.epp_total_pages + PAGES_PER_POOL - 1) / + PAGES_PER_POOL; + end_npools = (page_pools.epp_total_pages + npages + PAGES_PER_POOL -1) / + PAGES_PER_POOL; + LASSERT(end_npools <= page_pools.epp_max_pools); + + np_idx = 0; + while (cur_npools < end_npools) { + LASSERT(page_pools.epp_pools[cur_npools] == NULL); + LASSERT(np_idx < npools); + LASSERT(pools[np_idx] != NULL); + + page_pools.epp_pools[cur_npools++] = pools[np_idx]; + pools[np_idx++] = NULL; + } + + page_pools.epp_total_pages += npages; + page_pools.epp_free_pages += npages; + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + if (page_pools.epp_total_pages > page_pools.epp_st_max_pages) + page_pools.epp_st_max_pages = page_pools.epp_total_pages; + + CDEBUG(D_SEC, "add %d pages to total %lu\n", npages, + page_pools.epp_total_pages); + + spin_unlock(&page_pools.epp_lock); +} + +static int enc_pools_add_pages(int npages) +{ + static DEFINE_MUTEX(add_pages_mutex); + struct page ***pools; + int npools, alloced = 0; + int i, j, rc = -ENOMEM; + + if (npages < PTLRPC_MAX_BRW_PAGES) + npages = PTLRPC_MAX_BRW_PAGES; + + mutex_lock(&add_pages_mutex); + + if (npages + page_pools.epp_total_pages > page_pools.epp_max_pages) + npages = page_pools.epp_max_pages - page_pools.epp_total_pages; + LASSERT(npages > 0); + + page_pools.epp_st_grows++; + + npools = npages_to_npools(npages); + OBD_ALLOC(pools, npools * sizeof(*pools)); + if (pools == NULL) + goto out; + + for (i = 0; i < npools; i++) { + OBD_ALLOC(pools[i], PAGE_CACHE_SIZE); + if (pools[i] == NULL) + goto out_pools; + + for (j = 0; j < PAGES_PER_POOL && alloced < npages; j++) { + pools[i][j] = alloc_page(__GFP_IO | + __GFP_HIGHMEM); + if (pools[i][j] == NULL) + goto out_pools; + + alloced++; + } + } + LASSERT(alloced == npages); + + enc_pools_insert(pools, npools, npages); + CDEBUG(D_SEC, "added %d pages into pools\n", npages); + rc = 0; + +out_pools: + enc_pools_cleanup(pools, npools); + OBD_FREE(pools, npools * sizeof(*pools)); +out: + if (rc) { + page_pools.epp_st_grow_fails++; + CERROR("Failed to allocate %d enc pages\n", npages); + } + + mutex_unlock(&add_pages_mutex); + return rc; +} + +static inline void enc_pools_wakeup(void) +{ + LASSERT(spin_is_locked(&page_pools.epp_lock)); + LASSERT(page_pools.epp_waitqlen >= 0); + + if (unlikely(page_pools.epp_waitqlen)) { + LASSERT(waitqueue_active(&page_pools.epp_waitq)); + wake_up_all(&page_pools.epp_waitq); + } +} + +static int enc_pools_should_grow(int page_needed, long now) +{ + /* don't grow if someone else is growing the pools right now, + * or the pools has reached its full capacity + */ + if (page_pools.epp_growing || + page_pools.epp_total_pages == page_pools.epp_max_pages) + return 0; + + /* if total pages is not enough, we need to grow */ + if (page_pools.epp_total_pages < page_needed) + return 1; + + /* + * we wanted to return 0 here if there was a shrink just happened + * moment ago, but this may cause deadlock if both client and ost + * live on single node. + */ +#if 0 + if (now - page_pools.epp_last_shrink < 2) + return 0; +#endif + + /* + * here we perhaps need consider other factors like wait queue + * length, idle index, etc. ? + */ + + /* grow the pools in any other cases */ + return 1; +} + +/* + * we allocate the requested pages atomically. + */ +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) +{ + wait_queue_t waitlink; + unsigned long this_idle = -1; + cfs_time_t tick = 0; + long now; + int p_idx, g_idx; + int i; + + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); + + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_iov != NULL) + return 0; + + OBD_ALLOC(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + if (desc->bd_enc_iov == NULL) + return -ENOMEM; + + spin_lock(&page_pools.epp_lock); + + page_pools.epp_st_access++; +again: + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { + if (tick == 0) + tick = cfs_time_current(); + + now = cfs_time_current_sec(); + + page_pools.epp_st_missings++; + page_pools.epp_pages_short += desc->bd_iov_count; + + if (enc_pools_should_grow(desc->bd_iov_count, now)) { + page_pools.epp_growing = 1; + + spin_unlock(&page_pools.epp_lock); + enc_pools_add_pages(page_pools.epp_pages_short / 2); + spin_lock(&page_pools.epp_lock); + + page_pools.epp_growing = 0; + + enc_pools_wakeup(); + } else { + if (++page_pools.epp_waitqlen > + page_pools.epp_st_max_wqlen) + page_pools.epp_st_max_wqlen = + page_pools.epp_waitqlen; + + set_current_state(TASK_UNINTERRUPTIBLE); + init_waitqueue_entry_current(&waitlink); + add_wait_queue(&page_pools.epp_waitq, &waitlink); + + spin_unlock(&page_pools.epp_lock); + waitq_wait(&waitlink, TASK_UNINTERRUPTIBLE); + remove_wait_queue(&page_pools.epp_waitq, &waitlink); + LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); + page_pools.epp_waitqlen--; + } + + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; + + this_idle = 0; + goto again; + } + + /* record max wait time */ + if (unlikely(tick != 0)) { + tick = cfs_time_current() - tick; + if (tick > page_pools.epp_st_max_wait) + page_pools.epp_st_max_wait = tick; + } + + /* proceed with rest of allocation */ + page_pools.epp_free_pages -= desc->bd_iov_count; + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; + page_pools.epp_pools[p_idx][g_idx] = NULL; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + if (page_pools.epp_free_pages < page_pools.epp_st_lowfree) + page_pools.epp_st_lowfree = page_pools.epp_free_pages; + + /* + * new idle index = (old * weight + new) / (weight + 1) + */ + if (this_idle == -1) { + this_idle = page_pools.epp_free_pages * IDLE_IDX_MAX / + page_pools.epp_total_pages; + } + page_pools.epp_idle_idx = (page_pools.epp_idle_idx * IDLE_IDX_WEIGHT + + this_idle) / + (IDLE_IDX_WEIGHT + 1); + + page_pools.epp_last_access = cfs_time_current_sec(); + + spin_unlock(&page_pools.epp_lock); + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_get_pages); + +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) +{ + int p_idx, g_idx; + int i; + + if (desc->bd_enc_iov == NULL) + return; + + LASSERT(desc->bd_iov_count > 0); + + spin_lock(&page_pools.epp_lock); + + p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; + g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; + + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= + page_pools.epp_total_pages); + LASSERT(page_pools.epp_pools[p_idx]); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); + LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); + LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); + + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; + + if (++g_idx == PAGES_PER_POOL) { + p_idx++; + g_idx = 0; + } + } + + page_pools.epp_free_pages += desc->bd_iov_count; + + enc_pools_wakeup(); + + spin_unlock(&page_pools.epp_lock); + + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); + +/* + * we don't do much stuff for add_user/del_user anymore, except adding some + * initial pages in add_user() if current pools are empty, rest would be + * handled by the pools's self-adaption. + */ +int sptlrpc_enc_pool_add_user(void) +{ + int need_grow = 0; + + spin_lock(&page_pools.epp_lock); + if (page_pools.epp_growing == 0 && page_pools.epp_total_pages == 0) { + page_pools.epp_growing = 1; + need_grow = 1; + } + spin_unlock(&page_pools.epp_lock); + + if (need_grow) { + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); + + spin_lock(&page_pools.epp_lock); + page_pools.epp_growing = 0; + enc_pools_wakeup(); + spin_unlock(&page_pools.epp_lock); + } + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_add_user); + +int sptlrpc_enc_pool_del_user(void) +{ + return 0; +} +EXPORT_SYMBOL(sptlrpc_enc_pool_del_user); + +static inline void enc_pools_alloc(void) +{ + LASSERT(page_pools.epp_max_pools); + OBD_ALLOC_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +static inline void enc_pools_free(void) +{ + LASSERT(page_pools.epp_max_pools); + LASSERT(page_pools.epp_pools); + + OBD_FREE_LARGE(page_pools.epp_pools, + page_pools.epp_max_pools * + sizeof(*page_pools.epp_pools)); +} + +int sptlrpc_enc_pool_init(void) +{ + /* + * maximum capacity is 1/8 of total physical memory. + * is the 1/8 a good number? + */ + page_pools.epp_max_pages = num_physpages / 8; + page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); + + init_waitqueue_head(&page_pools.epp_waitq); + page_pools.epp_waitqlen = 0; + page_pools.epp_pages_short = 0; + + page_pools.epp_growing = 0; + + page_pools.epp_idle_idx = 0; + page_pools.epp_last_shrink = cfs_time_current_sec(); + page_pools.epp_last_access = cfs_time_current_sec(); + + spin_lock_init(&page_pools.epp_lock); + page_pools.epp_total_pages = 0; + page_pools.epp_free_pages = 0; + + page_pools.epp_st_max_pages = 0; + page_pools.epp_st_grows = 0; + page_pools.epp_st_grow_fails = 0; + page_pools.epp_st_shrinks = 0; + page_pools.epp_st_access = 0; + page_pools.epp_st_missings = 0; + page_pools.epp_st_lowfree = 0; + page_pools.epp_st_max_wqlen = 0; + page_pools.epp_st_max_wait = 0; + + enc_pools_alloc(); + if (page_pools.epp_pools == NULL) + return -ENOMEM; + + pools_shrinker = set_shrinker(pools_shrinker_seeks, + enc_pools_shrink); + if (pools_shrinker == NULL) { + enc_pools_free(); + return -ENOMEM; + } + + return 0; +} + +void sptlrpc_enc_pool_fini(void) +{ + unsigned long cleaned, npools; + + LASSERT(pools_shrinker); + LASSERT(page_pools.epp_pools); + LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); + + remove_shrinker(pools_shrinker); + + npools = npages_to_npools(page_pools.epp_total_pages); + cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); + LASSERT(cleaned == page_pools.epp_total_pages); + + enc_pools_free(); + + if (page_pools.epp_st_access > 0) { + CDEBUG(D_SEC, + "max pages %lu, grows %u, grow fails %u, shrinks %u, " + "access %lu, missing %lu, max qlen %u, max wait " + CFS_TIME_T"/%d\n", + page_pools.epp_st_max_pages, page_pools.epp_st_grows, + page_pools.epp_st_grow_fails, + page_pools.epp_st_shrinks, page_pools.epp_st_access, + page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, + page_pools.epp_st_max_wait, HZ); + } +} + + +static int cfs_hash_alg_id[] = { + [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, + [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, + [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, + [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, + [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, + [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, + [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, + [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, +}; +const char * sptlrpc_get_hash_name(__u8 hash_alg) +{ + return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); +} +EXPORT_SYMBOL(sptlrpc_get_hash_name); + +__u8 sptlrpc_get_hash_alg(const char *algname) +{ + return cfs_crypto_hash_alg(algname); +} +EXPORT_SYMBOL(sptlrpc_get_hash_alg); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + int size = msg->lm_buflens[offset]; + + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + if (bsd == NULL) { + CERROR("Invalid bulk sec desc: size %d\n", size); + return -EINVAL; + } + + if (swabbed) { + __swab32s(&bsd->bsd_nob); + } + + if (unlikely(bsd->bsd_version != 0)) { + CERROR("Unexpected version %u\n", bsd->bsd_version); + return -EPROTO; + } + + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); + return -EPROTO; + } + + /* FIXME more sanity check here */ + + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} +EXPORT_SYMBOL(bulk_sec_desc_unpack); + +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) +{ + struct cfs_crypto_hash_desc *hdesc; + int hashsize; + char hashbuf[64]; + unsigned int bufsize; + int i, err; + + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); + + hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); + if (IS_ERR(hdesc)) { + CERROR("Unable to initialize checksum hash %s\n", + cfs_crypto_hash_name(cfs_hash_alg_id[alg])); + return PTR_ERR(hdesc); + } + + hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); + + for (i = 0; i < desc->bd_iov_count; i++) { + cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page, + desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK, + desc->bd_iov[i].kiov_len); + } + if (hashsize > buflen) { + bufsize = sizeof(hashbuf); + err = cfs_crypto_hash_final(hdesc, (unsigned char *)hashbuf, + &bufsize); + memcpy(buf, hashbuf, buflen); + } else { + bufsize = buflen; + err = cfs_crypto_hash_final(hdesc, (unsigned char *)buf, + &bufsize); + } + + if (err) + cfs_crypto_hash_final(hdesc, NULL, NULL); + return err; +} +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c new file mode 100644 index 000000000000..a45a3929b59f --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c @@ -0,0 +1,1233 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + +const char *sptlrpc_part2name(enum lustre_sec_part part) +{ + switch (part) { + case LUSTRE_SP_CLI: + return "cli"; + case LUSTRE_SP_MDT: + return "mdt"; + case LUSTRE_SP_OST: + return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; + case LUSTRE_SP_MGS: + return "mgs"; + case LUSTRE_SP_ANY: + return "any"; + default: + return "err"; + } +} +EXPORT_SYMBOL(sptlrpc_part2name); + +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) +{ + const char *type = obd->obd_type->typ_name; + + if (!strcmp(type, LUSTRE_MDT_NAME)) + return LUSTRE_SP_MDT; + if (!strcmp(type, LUSTRE_OST_NAME)) + return LUSTRE_SP_OST; + if (!strcmp(type, LUSTRE_MGS_NAME)) + return LUSTRE_SP_MGS; + + CERROR("unknown target %p(%s)\n", obd, type); + return LUSTRE_SP_ANY; +} +EXPORT_SYMBOL(sptlrpc_target_sec_part); + +/**************************************** + * user supplied flavor string parsing * + ****************************************/ + +/* + * format: [-] + */ +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) +{ + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); + + if (str == NULL || str[0] == '\0') { + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; + return 0; + } + + strncpy(buf, str, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + + bulk = strchr(buf, '-'); + if (bulk) + *bulk++ = '\0'; + + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; + + /* + * currently only base flavor "plain" can have bulk specification. + */ + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } + + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } + + flvr->sf_flags = 0; + return 0; + +err_out: + CERROR("invalid flavor string: %s\n", str); + return -EINVAL; +} +EXPORT_SYMBOL(sptlrpc_parse_flavor); + +/**************************************** + * configure rules * + ****************************************/ + +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + +static void sptlrpc_rule_init(struct sptlrpc_rule *rule) +{ + rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); + rule->sr_from = LUSTRE_SP_ANY; + rule->sr_to = LUSTRE_SP_ANY; + rule->sr_padding = 0; + + get_default_flavor(&rule->sr_flvr); +} + +/* + * format: network[.direction]=flavor + */ +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) +{ + char *flavor, *dir; + int rc; + + sptlrpc_rule_init(rule); + + flavor = strchr(param, '='); + if (flavor == NULL) { + CERROR("invalid param, no '='\n"); + RETURN(-EINVAL); + } + *flavor++ = '\0'; + + dir = strchr(param, '.'); + if (dir) + *dir++ = '\0'; + + /* 1.1 network */ + if (strcmp(param, "default")) { + rule->sr_netid = libcfs_str2net(param); + if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { + CERROR("invalid network name: %s\n", param); + RETURN(-EINVAL); + } + } + + /* 1.2 direction */ + if (dir) { + if (!strcmp(dir, "mdt2ost")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "mdt2mdt")) { + rule->sr_from = LUSTRE_SP_MDT; + rule->sr_to = LUSTRE_SP_MDT; + } else if (!strcmp(dir, "cli2ost")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_OST; + } else if (!strcmp(dir, "cli2mdt")) { + rule->sr_from = LUSTRE_SP_CLI; + rule->sr_to = LUSTRE_SP_MDT; + } else { + CERROR("invalid rule dir segment: %s\n", dir); + RETURN(-EINVAL); + } + } + + /* 2.1 flavor */ + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); + if (rc) + RETURN(-EINVAL); + + RETURN(0); +} +EXPORT_SYMBOL(sptlrpc_parse_rule); + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) +{ + LASSERT(rset->srs_nslot || + (rset->srs_nrule == 0 && rset->srs_rules == NULL)); + + if (rset->srs_nslot) { + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + sptlrpc_rule_set_init(rset); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_free); + +/* + * return 0 if the rule set could accomodate one more rule. + */ +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *rules; + int nslot; + + might_sleep(); + + if (rset->srs_nrule < rset->srs_nslot) + return 0; + + nslot = rset->srs_nslot + 8; + + /* better use realloc() if available */ + OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); + if (rules == NULL) + return -ENOMEM; + + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); + + OBD_FREE(rset->srs_rules, + rset->srs_nslot * sizeof(*rset->srs_rules)); + } + + rset->srs_rules = rules; + rset->srs_nslot = nslot; + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_expand); + +static inline int rule_spec_dir(struct sptlrpc_rule *rule) +{ + return (rule->sr_from != LUSTRE_SP_ANY || + rule->sr_to != LUSTRE_SP_ANY); +} +static inline int rule_spec_net(struct sptlrpc_rule *rule) +{ + return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); +} +static inline int rule_match_dir(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); +} +static inline int rule_match_net(struct sptlrpc_rule *r1, + struct sptlrpc_rule *r2) +{ + return (r1->sr_netid == r2->sr_netid); +} + +/* + * merge @rule into @rset. + * the @rset slots might be expanded. + */ +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_rule *p = rset->srs_rules; + int spec_dir, spec_net; + int rc, n, match = 0; + + might_sleep(); + + spec_net = rule_spec_net(rule); + spec_dir = rule_spec_dir(rule); + + for (n = 0; n < rset->srs_nrule; n++) { + p = &rset->srs_rules[n]; + + /* test network match, if failed: + * - spec rule: skip rules which is also spec rule match, until + * we hit a wild rule, which means no more chance + * - wild rule: skip until reach the one which is also wild + * and matches + */ + if (!rule_match_net(p, rule)) { + if (spec_net) { + if (rule_spec_net(p)) + continue; + else + break; + } else { + continue; + } + } + + /* test dir match, same logic as net matching */ + if (!rule_match_dir(p, rule)) { + if (spec_dir) { + if (rule_spec_dir(p)) + continue; + else + break; + } else { + continue; + } + } + + /* find a match */ + match = 1; + break; + } + + if (match) { + LASSERT(n >= 0 && n < rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + /* remove this rule */ + if (n < rset->srs_nrule - 1) + memmove(&rset->srs_rules[n], + &rset->srs_rules[n + 1], + (rset->srs_nrule - n - 1) * + sizeof(*rule)); + rset->srs_nrule--; + } else { + /* override the rule */ + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + } + } else { + LASSERT(n >= 0 && n <= rset->srs_nrule); + + if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { + rc = sptlrpc_rule_set_expand(rset); + if (rc) + return rc; + + if (n < rset->srs_nrule) + memmove(&rset->srs_rules[n + 1], + &rset->srs_rules[n], + (rset->srs_nrule - n) * sizeof(*rule)); + memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); + rset->srs_nrule++; + } else { + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); + } + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_merge); + +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. + */ +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + + if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && + r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && + LNET_NIDNET(nid) != r->sr_netid) + continue; + + if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && + from != r->sr_from) + continue; + + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + + *sf = r->sr_flvr; + return 1; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_rule_set_choose); + +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + int n; + + for (n = 0; n < rset->srs_nrule; n++) { + r = &rset->srs_rules[n]; + CDEBUG(D_SEC, "<%02d> from %x to %x, net %x, rpc %x\n", n, + r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + } +} +EXPORT_SYMBOL(sptlrpc_rule_set_dump); + +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule_set *src[2] = { gen, tgt }; + struct sptlrpc_rule *rule; + int i, n, rc; + + might_sleep(); + + /* merge general rules firstly, then target-specific rules */ + for (i = 0; i < 2; i++) { + if (src[i] == NULL) + continue; + + for (n = 0; n < src[i]->srs_nrule; n++) { + rule = &src[i]->srs_rules[n]; + + if (from != LUSTRE_SP_ANY && + rule->sr_from != LUSTRE_SP_ANY && + rule->sr_from != from) + continue; + if (to != LUSTRE_SP_ANY && + rule->sr_to != LUSTRE_SP_ANY && + rule->sr_to != to) + continue; + + rc = sptlrpc_rule_set_merge(rset, rule); + if (rc) { + CERROR("can't merge: %d\n", rc); + return rc; + } + } + } + + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ + struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; + +static struct mutex sptlrpc_conf_lock; +static LIST_HEAD(sptlrpc_confs); + +static inline int is_hex(char c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f')); +} + +static void target2fsname(const char *tgt, char *fsname, int buflen) +{ + const char *ptr; + int len; + + ptr = strrchr(tgt, '-'); + if (ptr) { + if ((strncmp(ptr, "-MDT", 4) != 0 && + strncmp(ptr, "-OST", 4) != 0) || + !is_hex(ptr[4]) || !is_hex(ptr[5]) || + !is_hex(ptr[6]) || !is_hex(ptr[7])) + ptr = NULL; + } + + /* if we didn't find the pattern, treat the whole string as fsname */ + if (ptr == NULL) + len = strlen(tgt); + else + len = ptr - tgt; + + len = min(len, buflen - 1); + memcpy(fsname, tgt, len); + fsname[len] = '\0'; +} + +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, &conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + strcpy(conf->sc_fsname, fsname); + sptlrpc_rule_set_init(&conf->sc_rset); + INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. + */ +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) +{ + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; + ENTRY; + + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + RETURN(-EINVAL); + } + + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + RETURN(-EINVAL); + } + + CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + RETURN(-EINVAL); + } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + RETURN(-EINVAL); + + if (conf == NULL) { + target2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); + } + + if (rc == 0) + conf->sc_modified++; + + RETURN(rc); +} + +int sptlrpc_process_config(struct lustre_cfg *lcfg) +{ + return __sptlrpc_process_config(lcfg, NULL); +} +EXPORT_SYMBOL(sptlrpc_process_config); + +static int logname2fsname(const char *logname, char *buf, int buflen) +{ + char *ptr; + int len; + + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) { + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } + + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + +void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf && conf->sc_local) { + LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); + } + conf->sc_modified = 0; + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); + +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static void inline flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. + */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + target2fsname(target->uuid, name, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). + */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} +EXPORT_SYMBOL(sptlrpc_target_choose_flavor); + +#define SEC_ADAPT_DELAY (10) + +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) ==0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read(&obd->u.cli.cl_sem); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = cfs_time_current_sec() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + + +static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen) +{ + char dirbuf[8]; + char *net; + char *ptr = buf; + + if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY)) + net = "default"; + else + net = libcfs_net2str(r->sr_netid); + + if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY) + dirbuf[0] = '\0'; + else + snprintf(dirbuf, sizeof(dirbuf), ".%s2%s", + sptlrpc_part2name(r->sr_from), + sptlrpc_part2name(r->sr_to)); + + ptr += snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf); + + sptlrpc_flavor2name(&r->sr_flvr, ptr, buflen - (ptr - buf)); + buf[buflen - 1] = '\0'; +} + +static int sptlrpc_record_rule_set(struct llog_handle *llh, + char *target, + struct sptlrpc_rule_set *rset) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + struct llog_rec_hdr rec; + int buflen; + char param[48]; + int i, rc; + + for (i = 0; i < rset->srs_nrule; i++) { + rule2string(&rset->srs_rules[i], param, sizeof(param)); + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set_string(&bufs, 1, target); + lustre_cfg_bufs_set_string(&bufs, 2, param); + lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs); + LASSERT(lcfg); + + buflen = lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens); + rec.lrh_len = llog_data_len(buflen); + rec.lrh_type = OBD_CFG_REC; + rc = llog_write(NULL, llh, &rec, NULL, 0, (void *)lcfg, -1); + if (rc) + CERROR("failed to write a rec: rc = %d\n", rc); + lustre_cfg_free(lcfg); + } + return 0; +} + +static int sptlrpc_record_rules(struct llog_handle *llh, + struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset); + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + sptlrpc_record_rule_set(llh, conf_tgt->sct_name, + &conf_tgt->sct_rset); + } + return 0; +} + +#define LOG_SPTLRPC_TMP "sptlrpc.tmp" +#define LOG_SPTLRPC "sptlrpc" + +static +int sptlrpc_target_local_copy_conf(struct obd_device *obd, + struct sptlrpc_conf *conf) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + struct lvfs_run_ctxt saved; + struct dentry *dentry; + int rc; + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt == NULL) + RETURN(-EINVAL); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + dentry = ll_lookup_one_len(MOUNT_CONFIGS_DIR, cfs_fs_pwd(current->fs), + strlen(MOUNT_CONFIGS_DIR)); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + CERROR("cannot lookup %s directory: rc = %d\n", + MOUNT_CONFIGS_DIR, rc); + GOTO(out_ctx, rc); + } + + /* erase the old tmp log */ + rc = llog_erase(NULL, ctxt, NULL, LOG_SPTLRPC_TMP); + if (rc < 0 && rc != -ENOENT) { + CERROR("%s: cannot erase temporary sptlrpc log: rc = %d\n", + obd->obd_name, rc); + GOTO(out_dput, rc); + } + + /* write temporary log */ + rc = llog_open_create(NULL, ctxt, &llh, NULL, LOG_SPTLRPC_TMP); + if (rc) + GOTO(out_dput, rc); + rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + rc = sptlrpc_record_rules(llh, conf); + +out_close: + llog_close(NULL, llh); + if (rc == 0) + rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt, + LOG_SPTLRPC_TMP, LOG_SPTLRPC); +out_dput: + l_dput(dentry); +out_ctx: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n", + obd->obd_name, rc); + RETURN(rc); +} + +static int local_read_handler(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct sptlrpc_conf *conf = (struct sptlrpc_conf *) data; + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + int cfg_len, rc; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); + RETURN(-EINVAL); + } + + cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail); + + rc = lustre_cfg_sanity_check(lcfg, cfg_len); + if (rc) { + CERROR("Insane cfg\n"); + RETURN(rc); + } + + if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) { + CERROR("invalid command (%x)\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + RETURN(__sptlrpc_process_config(lcfg, conf)); +} + +static +int sptlrpc_target_local_read_conf(struct obd_device *obd, + struct sptlrpc_conf *conf) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + struct lvfs_run_ctxt saved; + int rc; + ENTRY; + + LASSERT(conf->sc_updated == 0 && conf->sc_local == 0); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt == NULL) { + CERROR("missing llog context\n"); + RETURN(-EINVAL); + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + rc = llog_open(NULL, ctxt, &llh, NULL, LOG_SPTLRPC, LLOG_OPEN_EXISTS); + if (rc < 0) { + if (rc == -ENOENT) + rc = 0; + GOTO(out_pop, rc); + } + + rc = llog_init_handle(NULL, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + if (llog_get_size(llh) <= 1) { + CDEBUG(D_SEC, "no local sptlrpc copy found\n"); + GOTO(out_close, rc = 0); + } + + rc = llog_process(NULL, llh, local_read_handler, (void *)conf, NULL); + + if (rc == 0) { + conf->sc_local = 1; + } else { + sptlrpc_conf_free_rsets(conf); + } + +out_close: + llog_close(NULL, llh); +out_pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n", + obd->obd_name, rc); + RETURN(rc); +} + + +/** + * called by target devices, extract sptlrpc rules which applies to + * this target, to be used for future rpc flavor checking. + */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset, + int initial) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid); + + target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + GOTO(out, rc); + } + + if (conf->sc_updated == 0) { + /* + * always read from local copy. here another option is + * if we already have a local copy (read from another + * target device hosted on the same node) we simply use that. + */ + if (conf->sc_local) + sptlrpc_conf_free_rsets(conf); + + sptlrpc_target_local_read_conf(obd, conf); + } else { + LASSERT(conf->sc_local == 0); + + /* write a local copy */ + if (initial || conf->sc_modified) + sptlrpc_target_local_copy_conf(obd, conf); + else + CDEBUG(D_SEC, "unchanged, skip updating local copy\n"); + } + + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ? &conf_tgt->sct_rset: NULL, + LUSTRE_SP_ANY, sp_dst, rset); +out: + mutex_unlock(&sptlrpc_conf_lock); + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_conf_target_get_rules); + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); + return 0; +} + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) { + sptlrpc_conf_free(conf); + } + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c new file mode 100644 index 000000000000..4c96a14a1bb6 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c @@ -0,0 +1,250 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_gc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include + +#include +#include +#include +#include + +#define SEC_GC_INTERVAL (30 * 60) + + +static struct mutex sec_gc_mutex; +static LIST_HEAD(sec_gc_list); +static spinlock_t sec_gc_list_lock; + +static LIST_HEAD(sec_gc_ctx_list); +static spinlock_t sec_gc_ctx_list_lock; + +static struct ptlrpc_thread sec_gc_thread; +static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); + + +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + LASSERT(sec->ps_gc_interval > 0); + LASSERT(list_empty(&sec->ps_gc_list)); + + sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval; + + spin_lock(&sec_gc_list_lock); + list_add_tail(&sec_gc_list, &sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_add_sec); + +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) +{ + if (list_empty(&sec->ps_gc_list)) + return; + + might_sleep(); + + /* signal before list_del to make iteration in gc thread safe */ + atomic_inc(&sec_gc_wait_del); + + spin_lock(&sec_gc_list_lock); + list_del_init(&sec->ps_gc_list); + spin_unlock(&sec_gc_list_lock); + + /* barrier */ + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); + + atomic_dec(&sec_gc_wait_del); + + CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); +} +EXPORT_SYMBOL(sptlrpc_gc_del_sec); + +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx) +{ + LASSERT(list_empty(&ctx->cc_gc_chain)); + + CDEBUG(D_SEC, "hand over ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + spin_lock(&sec_gc_ctx_list_lock); + list_add(&ctx->cc_gc_chain, &sec_gc_ctx_list); + spin_unlock(&sec_gc_ctx_list_lock); + + thread_add_flags(&sec_gc_thread, SVC_SIGNAL); + wake_up(&sec_gc_thread.t_ctl_waitq); +} +EXPORT_SYMBOL(sptlrpc_gc_add_ctx); + +static void sec_process_ctx_list(void) +{ + struct ptlrpc_cli_ctx *ctx; + + spin_lock(&sec_gc_ctx_list_lock); + + while (!list_empty(&sec_gc_ctx_list)) { + ctx = list_entry(sec_gc_ctx_list.next, + struct ptlrpc_cli_ctx, cc_gc_chain); + list_del_init(&ctx->cc_gc_chain); + spin_unlock(&sec_gc_ctx_list_lock); + + LASSERT(ctx->cc_sec); + LASSERT(atomic_read(&ctx->cc_refcount) == 1); + CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", + ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); + sptlrpc_cli_ctx_put(ctx, 1); + + spin_lock(&sec_gc_ctx_list_lock); + } + + spin_unlock(&sec_gc_ctx_list_lock); +} + +static void sec_do_gc(struct ptlrpc_sec *sec) +{ + LASSERT(sec->ps_policy->sp_cops->gc_ctx); + + if (unlikely(sec->ps_gc_next == 0)) { + CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", + sec, sec->ps_policy->sp_name); + return; + } + + CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); + + if (cfs_time_after(sec->ps_gc_next, cfs_time_current_sec())) + return; + + sec->ps_policy->sp_cops->gc_ctx(sec); + sec->ps_gc_next = cfs_time_current_sec() + sec->ps_gc_interval; +} + +static int sec_gc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *) arg; + struct l_wait_info lwi; + + unshare_fs_struct(); + + /* Record that the thread is running */ + thread_set_flags(thread, SVC_RUNNING); + wake_up(&thread->t_ctl_waitq); + + while (1) { + struct ptlrpc_sec *sec; + + thread_clear_flags(thread, SVC_SIGNAL); + sec_process_ctx_list(); +again: + /* go through sec list do gc. + * FIXME here we iterate through the whole list each time which + * is not optimal. we perhaps want to use balanced binary tree + * to trace each sec as order of expiry time. + * another issue here is we wakeup as fixed interval instead of + * according to each sec's expiry time */ + mutex_lock(&sec_gc_mutex); + list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { + /* if someone is waiting to be deleted, let it + * proceed as soon as possible. */ + if (atomic_read(&sec_gc_wait_del)) { + CDEBUG(D_SEC, "deletion pending, start over\n"); + mutex_unlock(&sec_gc_mutex); + goto again; + } + + sec_do_gc(sec); + } + mutex_unlock(&sec_gc_mutex); + + /* check ctx list again before sleep */ + sec_process_ctx_list(); + + lwi = LWI_TIMEOUT(SEC_GC_INTERVAL * HZ, NULL, NULL); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopping(thread) || + thread_is_signal(thread), + &lwi); + + if (thread_test_and_clear_flags(thread, SVC_STOPPING)) + break; + } + + thread_set_flags(thread, SVC_STOPPED); + wake_up(&thread->t_ctl_waitq); + return 0; +} + +int sptlrpc_gc_init(void) +{ + struct l_wait_info lwi = { 0 }; + task_t *task; + + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + + /* initialize thread control */ + memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); + init_waitqueue_head(&sec_gc_thread.t_ctl_waitq); + + task = kthread_run(sec_gc_main, &sec_gc_thread, "sptlrpc_gc"); + if (IS_ERR(task)) { + CERROR("can't start gc thread: %ld\n", PTR_ERR(task)); + return PTR_ERR(task); + } + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_running(&sec_gc_thread), &lwi); + return 0; +} + +void sptlrpc_gc_fini(void) +{ + struct l_wait_info lwi = { 0 }; + + thread_set_flags(&sec_gc_thread, SVC_STOPPING); + wake_up(&sec_gc_thread.t_ctl_waitq); + + l_wait_event(sec_gc_thread.t_ctl_waitq, + thread_is_stopped(&sec_gc_thread), &lwi); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c new file mode 100644 index 000000000000..920591b3bb17 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c @@ -0,0 +1,198 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_lproc.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "ptlrpc_internal.h" + + +struct proc_dir_entry *sptlrpc_proc_root = NULL; +EXPORT_SYMBOL(sptlrpc_proc_root); + +char *sec_flags2str(unsigned long flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strlcat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strlcat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strlcat(buf, "udesc,", bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strlcat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strlcat(buf, "-,", bufsize); + + return buf; +} + +static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + char str[32]; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); + + seq_printf(seq, "rpc flavor: %s\n", + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); + seq_printf(seq, "id: %d\n", sec->ps_id); + seq_printf(seq, "refcount: %d\n", + atomic_read(&sec->ps_refcount)); + seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); + seq_printf(seq, "gc internal %ld\n", sec->ps_gc_interval); + seq_printf(seq, "gc next %ld\n", + sec->ps_gc_interval ? + sec->ps_gc_next - cfs_time_current_sec() : 0); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); + +static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *dev = seq->private; + struct client_obd *cli = &dev->u.cli; + struct ptlrpc_sec *sec = NULL; + + LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); + + if (cli->cl_import) + sec = sptlrpc_import_sec_ref(cli->cl_import); + if (sec == NULL) + goto out; + + if (sec->ps_policy->sp_cops->display) + sec->ps_policy->sp_cops->display(sec, seq); + + sptlrpc_sec_put(sec); +out: + return 0; +} +LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); + +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) +{ + int rc; + + if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && + strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) { + CERROR("can't register lproc for obd type %s\n", + dev->obd_type->typ_name); + return -EINVAL; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_info", 0444, + &sptlrpc_info_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_info for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + rc = lprocfs_obd_seq_create(dev, "srpc_contexts", 0444, + &sptlrpc_ctxs_lprocfs_fops, dev); + if (rc) { + CERROR("create proc entry srpc_contexts for %s: %d\n", + dev->obd_name, rc); + return rc; + } + + return 0; +} +EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); + +static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { + { "encrypt_page_pools", sptlrpc_proc_read_enc_pool, NULL, NULL }, + { NULL } +}; + +int sptlrpc_lproc_init(void) +{ + int rc; + + LASSERT(sptlrpc_proc_root == NULL); + + sptlrpc_proc_root = lprocfs_register("sptlrpc", proc_lustre_root, + sptlrpc_lprocfs_vars, NULL); + if (IS_ERR(sptlrpc_proc_root)) { + rc = PTR_ERR(sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + return rc; + } + return 0; +} + +void sptlrpc_lproc_fini(void) +{ + if (sptlrpc_proc_root) { + lprocfs_remove(&sptlrpc_proc_root); + sptlrpc_proc_root = NULL; + } +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c new file mode 100644 index 000000000000..ff1137fe4dd6 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c @@ -0,0 +1,464 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_null.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +static struct ptlrpc_sec_policy null_policy; +static struct ptlrpc_sec null_sec; +static struct ptlrpc_cli_ctx null_cli_ctx; +static struct ptlrpc_svc_ctx null_svc_ctx; + +/* + * we can temporarily use the topmost 8-bits of lm_secflvr to identify + * the source sec part. + */ +static inline +void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +{ + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; +} + +static inline +enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) +{ + return (msg->lm_secflvr >> 24) & 0xFF; +} + +static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + + if (!req->rq_import->imp_dlm_fake) { + struct obd_device *obd = req->rq_import->imp_obd; + null_encode_sec_part(req->rq_reqbuf, + obd->u.cli.cl_sp_me); + } + req->rq_reqdata_len = req->rq_reqlen; + return 0; +} + +static +int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + __u32 cksums, cksumc; + + LASSERT(req->rq_repdata); + + req->rq_repmsg = req->rq_repdata; + req->rq_replen = req->rq_repdata_len; + + if (req->rq_early) { + cksums = lustre_msg_get_cksum(req->rq_repdata); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) + if (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_CKSUM_INCOMPAT18) + cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 0); + else + cksumc = lustre_msg_calc_cksum(req->rq_repmsg, 1); +#else +# warning "remove checksum compatibility support for b1_8" + cksumc = lustre_msg_calc_cksum(req->rq_repmsg); +#endif + if (cksumc != cksums) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cksumc, cksums); + return -EINVAL; + } + } + + return 0; +} + +static +struct ptlrpc_sec *null_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); + + /* general layer has take a module reference for us, because we never + * really destroy the sec, simply release the reference here. + */ + sptlrpc_policy_put(&null_policy); + return &null_sec; +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + LASSERT(sec == &null_sec); +} + +static +struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + atomic_inc(&null_cli_ctx.cc_refcount); + return &null_cli_ctx; +} + +static +int null_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, + int grace, int force) +{ + return 0; +} + +static +int null_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + if (!req->rq_reqbuf) { + int alloc_size = size_roundup_power2(msgsize); + + LASSERT(!req->rq_pool); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_size); + if (!req->rq_reqbuf) + return -ENOMEM; + + req->rq_reqbuf_len = alloc_size; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= msgsize); + memset(req->rq_reqbuf, 0, msgsize); + } + + req->rq_reqmsg = req->rq_reqbuf; + return 0; +} + +static +void null_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + if (!req->rq_pool) { + LASSERTF(req->rq_reqmsg == req->rq_reqbuf, + "req %p: reqmsg %p is not reqbuf %p in null sec\n", + req, req->rq_reqmsg, req->rq_reqbuf); + LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, + "req %p: reqlen %d should smaller than buflen %d\n", + req, req->rq_reqlen, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } +} + +static +int null_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + /* add space for early replied */ + msgsize += lustre_msg_early_size(); + + msgsize = size_roundup_power2(msgsize); + + OBD_ALLOC_LARGE(req->rq_repbuf, msgsize); + if (!req->rq_repbuf) + return -ENOMEM; + + req->rq_repbuf_len = msgsize; + return 0; +} + +static +void null_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_repbuf); + + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; +} + +static +int null_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + struct lustre_msg *oldbuf = req->rq_reqmsg; + int oldsize, newmsg_size, alloc_size; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf == req->rq_reqmsg); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); + + /* compute new message size */ + oldsize = req->rq_reqbuf->lm_buflens[segment]; + req->rq_reqbuf->lm_buflens[segment] = newsize; + newmsg_size = lustre_packed_msg_size(oldbuf); + req->rq_reqbuf->lm_buflens[segment] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); + + if (req->rq_reqbuf_len < newmsg_size) { + alloc_size = size_roundup_power2(newmsg_size); + + OBD_ALLOC_LARGE(newbuf, alloc_size); + if (newbuf == NULL) + return -ENOMEM; + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = req->rq_reqmsg = newbuf; + req->rq_reqbuf_len = alloc_size; + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + req->rq_reqlen = newmsg_size; + + return 0; +} + +static struct ptlrpc_svc_ctx null_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &null_policy, +}; + +static +int null_accept(struct ptlrpc_request *req) +{ + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); + + if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { + CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); + return SECSVC_DROP; + } + + req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); + + req->rq_reqmsg = req->rq_reqbuf; + req->rq_reqlen = req->rq_reqdata_len; + + req->rq_svc_ctx = &null_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + return SECSVC_OK; +} + +static +int null_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + int rs_size = sizeof(*rs) + msgsize; + + LASSERT(msgsize % 8 == 0); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + return -ENOMEM; + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + rs->rs_msg = rs->rs_repbuf; + + req->rq_reply_state = rs; + return 0; +} + +static +void null_free_rs(struct ptlrpc_reply_state *rs) +{ + LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); +} + +static +int null_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + + LASSERT(rs); + + rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; + rs->rs_repdata_len = req->rq_replen; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = lustre_msg_early_size(); + else + req->rq_reply_off = 0; + } else { + __u32 cksum; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) + if (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_CKSUM_INCOMPAT18) + cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 0); + else + cksum = lustre_msg_calc_cksum(rs->rs_repbuf, 1); +#else +# warning "remove checksum compatibility support for b1_8" + cksum = lustre_msg_calc_cksum(rs->rs_repbuf); +#endif + lustre_msg_set_cksum(rs->rs_repbuf, cksum); + req->rq_reply_off = 0; + } + + return 0; +} + +static struct ptlrpc_ctx_ops null_ctx_ops = { + .refresh = null_ctx_refresh, + .sign = null_ctx_sign, + .verify = null_ctx_verify, +}; + +static struct ptlrpc_sec_cops null_sec_cops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .lookup_ctx = null_lookup_ctx, + .flush_ctx_cache = null_flush_ctx_cache, + .alloc_reqbuf = null_alloc_reqbuf, + .alloc_repbuf = null_alloc_repbuf, + .free_reqbuf = null_free_reqbuf, + .free_repbuf = null_free_repbuf, + .enlarge_reqbuf = null_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops null_sec_sops = { + .accept = null_accept, + .alloc_rs = null_alloc_rs, + .authorize = null_authorize, + .free_rs = null_free_rs, +}; + +static struct ptlrpc_sec_policy null_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "sec.null", + .sp_policy = SPTLRPC_POLICY_NULL, + .sp_cops = &null_sec_cops, + .sp_sops = &null_sec_sops, +}; + +static void null_init_internal(void) +{ + static HLIST_HEAD(__list); + + null_sec.ps_policy = &null_policy; + atomic_set(&null_sec.ps_refcount, 1); /* always busy */ + null_sec.ps_id = -1; + null_sec.ps_import = NULL; + null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; + null_sec.ps_flvr.sf_flags = 0; + null_sec.ps_part = LUSTRE_SP_ANY; + null_sec.ps_dying = 0; + spin_lock_init(&null_sec.ps_lock); + atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ + INIT_LIST_HEAD(&null_sec.ps_gc_list); + null_sec.ps_gc_interval = 0; + null_sec.ps_gc_next = 0; + + hlist_add_head(&null_cli_ctx.cc_cache, &__list); + atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ + null_cli_ctx.cc_sec = &null_sec; + null_cli_ctx.cc_ops = &null_ctx_ops; + null_cli_ctx.cc_expire = 0; + null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | + PTLRPC_CTX_UPTODATE; + null_cli_ctx.cc_vcred.vc_uid = 0; + spin_lock_init(&null_cli_ctx.cc_lock); + INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); + INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); +} + +int sptlrpc_null_init(void) +{ + int rc; + + null_init_internal(); + + rc = sptlrpc_register_policy(&null_policy); + if (rc) + CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); + + return rc; +} + +void sptlrpc_null_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&null_policy); + if (rc) + CERROR("failed to unregister %s: %d\n", null_policy.sp_name,rc); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c new file mode 100644 index 000000000000..f552d2f182b1 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c @@ -0,0 +1,1021 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/ptlrpc/sec_plain.c + * + * Author: Eric Mei + */ + +#define DEBUG_SUBSYSTEM S_SEC + + +#include +#include +#include +#include +#include + +struct plain_sec { + struct ptlrpc_sec pls_base; + rwlock_t pls_lock; + struct ptlrpc_cli_ctx *pls_ctx; +}; + +static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) +{ + return container_of(sec, struct plain_sec, pls_base); +} + +static struct ptlrpc_sec_policy plain_policy; +static struct ptlrpc_ctx_ops plain_ctx_ops; +static struct ptlrpc_svc_ctx plain_svc_ctx; + +static unsigned int plain_at_offset; + +/* + * for simplicity, plain policy rpc use fixed layout. + */ +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; + +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; + +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) + +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) +{ + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; +} + +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) +{ + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); +} + +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len == 0) + continue; + + ptr = kmap(desc->bd_iov[i].kiov_page); + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr[off] ^= 0x1; + kunmap(desc->bd_iov[i].kiov_page); + return; + } +} + +/**************************************** + * cli_ctx apis * + ****************************************/ + +static +int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) +{ + /* should never reach here */ + LBUG(); + return 0; +} + +static +int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) +{ + return 0; +} + +static +int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + ENTRY; + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, + msg->lm_buflens); + RETURN(0); +} + +static +int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; + int swabbed; + ENTRY; + + if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { + CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); + RETURN(-EPROTO); + } + + swabbed = ptlrpc_rep_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + /* expect no user desc in reply */ + if (phdr->ph_flags & PLAIN_FL_USER) { + CERROR("Unexpected udesc flag in reply\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + + if (unlikely(req->rq_early)) { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&cksum, &hsize); + if (cksum != msg->lm_cksum) { + CDEBUG(D_SEC, + "early reply checksum mismatch: %08x != %08x\n", + cpu_to_le32(cksum), msg->lm_cksum); + RETURN(-EINVAL); + } + } else { + /* whether we sent with bulk or not, we expect the same + * in reply, except for early reply */ + if (!req->rq_early && + !equi(req->rq_pack_bulk == 1, + phdr->ph_flags & PLAIN_FL_BULK)) { + CERROR("%s bulk checksum in reply\n", + req->rq_pack_bulk ? "Missing" : "Unexpected"); + RETURN(-EPROTO); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(-EPROTO); + } + } + + req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); + RETURN(0); +} + +static +int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; +} + +static +int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_bulk_sec_desc *bsdv; + struct plain_bulk_token *tokenv; + int rc; + int i, nob; + + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); + LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); + + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; +} + +/**************************************** + * sec apis * + ****************************************/ + +static +struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) +{ + struct ptlrpc_cli_ctx *ctx, *ctx_new; + + OBD_ALLOC_PTR(ctx_new); + + write_lock(&plsec->pls_lock); + + ctx = plsec->pls_ctx; + if (ctx) { + atomic_inc(&ctx->cc_refcount); + + if (ctx_new) + OBD_FREE_PTR(ctx_new); + } else if (ctx_new) { + ctx = ctx_new; + + atomic_set(&ctx->cc_refcount, 1); /* for cache */ + ctx->cc_sec = &plsec->pls_base; + ctx->cc_ops = &plain_ctx_ops; + ctx->cc_expire = 0; + ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; + ctx->cc_vcred.vc_uid = 0; + spin_lock_init(&ctx->cc_lock); + INIT_LIST_HEAD(&ctx->cc_req_list); + INIT_LIST_HEAD(&ctx->cc_gc_chain); + + plsec->pls_ctx = ctx; + atomic_inc(&plsec->pls_base.ps_nctx); + atomic_inc(&plsec->pls_base.ps_refcount); + + atomic_inc(&ctx->cc_refcount); /* for caller */ + } + + write_unlock(&plsec->pls_lock); + + return ctx; +} + +static +void plain_destroy_sec(struct ptlrpc_sec *sec) +{ + struct plain_sec *plsec = sec2plsec(sec); + ENTRY; + + LASSERT(sec->ps_policy == &plain_policy); + LASSERT(sec->ps_import); + LASSERT(atomic_read(&sec->ps_refcount) == 0); + LASSERT(atomic_read(&sec->ps_nctx) == 0); + LASSERT(plsec->pls_ctx == NULL); + + class_import_put(sec->ps_import); + + OBD_FREE_PTR(plsec); + EXIT; +} + +static +void plain_kill_sec(struct ptlrpc_sec *sec) +{ + sec->ps_dying = 1; +} + +static +struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, + struct ptlrpc_svc_ctx *svc_ctx, + struct sptlrpc_flavor *sf) +{ + struct plain_sec *plsec; + struct ptlrpc_sec *sec; + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); + + OBD_ALLOC_PTR(plsec); + if (plsec == NULL) + RETURN(NULL); + + /* + * initialize plain_sec + */ + rwlock_init(&plsec->pls_lock); + plsec->pls_ctx = NULL; + + sec = &plsec->pls_base; + sec->ps_policy = &plain_policy; + atomic_set(&sec->ps_refcount, 0); + atomic_set(&sec->ps_nctx, 0); + sec->ps_id = sptlrpc_get_next_secid(); + sec->ps_import = class_import_get(imp); + sec->ps_flvr = *sf; + spin_lock_init(&sec->ps_lock); + INIT_LIST_HEAD(&sec->ps_gc_list); + sec->ps_gc_interval = 0; + sec->ps_gc_next = 0; + + /* install ctx immediately if this is a reverse sec */ + if (svc_ctx) { + ctx = plain_sec_install_ctx(plsec); + if (ctx == NULL) { + plain_destroy_sec(sec); + RETURN(NULL); + } + sptlrpc_cli_ctx_put(ctx, 1); + } + + RETURN(sec); +} + +static +struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, int remove_dead) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + read_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + if (ctx) + atomic_inc(&ctx->cc_refcount); + read_unlock(&plsec->pls_lock); + + if (unlikely(ctx == NULL)) + ctx = plain_sec_install_ctx(plsec); + + RETURN(ctx); +} + +static +void plain_release_ctx(struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, int sync) +{ + LASSERT(atomic_read(&sec->ps_refcount) > 0); + LASSERT(atomic_read(&sec->ps_nctx) > 0); + LASSERT(atomic_read(&ctx->cc_refcount) == 0); + LASSERT(ctx->cc_sec == sec); + + OBD_FREE_PTR(ctx); + + atomic_dec(&sec->ps_nctx); + sptlrpc_sec_put(sec); +} + +static +int plain_flush_ctx_cache(struct ptlrpc_sec *sec, + uid_t uid, int grace, int force) +{ + struct plain_sec *plsec = sec2plsec(sec); + struct ptlrpc_cli_ctx *ctx; + ENTRY; + + /* do nothing unless caller want to flush for 'all' */ + if (uid != -1) + RETURN(0); + + write_lock(&plsec->pls_lock); + ctx = plsec->pls_ctx; + plsec->pls_ctx = NULL; + write_unlock(&plsec->pls_lock); + + if (ctx) + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(0); +} + +static +int plain_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_udesc) + buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + if (!req->rq_reqbuf) { + LASSERT(!req->rq_pool); + + alloc_len = size_roundup_power2(alloc_len); + OBD_ALLOC_LARGE(req->rq_reqbuf, alloc_len); + if (!req->rq_reqbuf) + RETURN(-ENOMEM); + + req->rq_reqbuf_len = alloc_len; + } else { + LASSERT(req->rq_pool); + LASSERT(req->rq_reqbuf_len >= alloc_len); + memset(req->rq_reqbuf, 0, alloc_len); + } + + lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); + + if (req->rq_pack_udesc) + sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); + + RETURN(0); +} + +static +void plain_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + if (!req->rq_pool) { + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqbuf_len = 0; + } + EXIT; +} + +static +int plain_alloc_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int alloc_len; + ENTRY; + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk) { + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + } + + alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + /* add space for early reply */ + alloc_len += plain_at_offset; + + alloc_len = size_roundup_power2(alloc_len); + + OBD_ALLOC_LARGE(req->rq_repbuf, alloc_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + req->rq_repbuf_len = alloc_len; + RETURN(0); +} + +static +void plain_free_repbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + ENTRY; + OBD_FREE_LARGE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repbuf_len = 0; + EXIT; +} + +static +int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize) +{ + struct lustre_msg *newbuf; + int oldsize; + int newmsg_size, newbuf_size; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); + LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == + req->rq_reqmsg); + + /* compute new embedded msg size. */ + oldsize = req->rq_reqmsg->lm_buflens[segment]; + req->rq_reqmsg->lm_buflens[segment] = newsize; + newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, + req->rq_reqmsg->lm_buflens); + req->rq_reqmsg->lm_buflens[segment] = oldsize; + + /* compute new wrapper msg size. */ + oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; + newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, + req->rq_reqbuf->lm_buflens); + req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; + + /* request from pool should always have enough buffer */ + LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); + + if (req->rq_reqbuf_len < newbuf_size) { + newbuf_size = size_roundup_power2(newbuf_size); + + OBD_ALLOC_LARGE(newbuf, newbuf_size); + if (newbuf == NULL) + RETURN(-ENOMEM); + + memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); + + OBD_FREE_LARGE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = newbuf; + req->rq_reqbuf_len = newbuf_size; + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, + PLAIN_PACK_MSG_OFF, 0); + } + + _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, + newmsg_size); + _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); + + req->rq_reqlen = newmsg_size; + RETURN(0); +} + +/**************************************** + * service apis * + ****************************************/ + +static struct ptlrpc_svc_ctx plain_svc_ctx = { + .sc_refcount = ATOMIC_INIT(1), + .sc_policy = &plain_policy, +}; + +static +int plain_accept(struct ptlrpc_request *req) +{ + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; + int swabbed; + ENTRY; + + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } + + if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { + CERROR("unexpected request buf count %u\n", msg->lm_bufcount); + RETURN(SECSVC_DROP); + } + + swabbed = ptlrpc_req_need_swab(req); + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { + if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, + swabbed)) { + CERROR("Mal-formed user descriptor\n"); + RETURN(SECSVC_DROP); + } + + req->rq_pack_udesc = 1; + req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); + } + + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg, swabbed)) + RETURN(SECSVC_DROP); + + req->rq_pack_bulk = 1; + } + + req->rq_reqmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); + req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; + + req->rq_svc_ctx = &plain_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + + RETURN(SECSVC_OK); +} + +static +int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_reply_state *rs; + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rs_size = sizeof(*rs); + ENTRY; + + LASSERT(msgsize % 8 == 0); + + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); + buflens[PLAIN_PACK_MSG_OFF] = msgsize; + + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; + + rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rs = req->rq_reply_state; + + if (rs) { + /* pre-allocated */ + LASSERT(rs->rs_size >= rs_size); + } else { + OBD_ALLOC_LARGE(rs, rs_size); + if (rs == NULL) + RETURN(-ENOMEM); + + rs->rs_size = rs_size; + } + + rs->rs_svc_ctx = req->rq_svc_ctx; + atomic_inc(&req->rq_svc_ctx->sc_refcount); + rs->rs_repbuf = (struct lustre_msg *) (rs + 1); + rs->rs_repbuf_len = rs_size - sizeof(*rs); + + lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); + rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); + + req->rq_reply_state = rs; + RETURN(0); +} + +static +void plain_free_rs(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); + atomic_dec(&rs->rs_svc_ctx->sc_refcount); + + if (!rs->rs_prealloc) + OBD_FREE_LARGE(rs, rs->rs_size); + EXIT; +} + +static +int plain_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct plain_header *phdr; + int len; + ENTRY; + + LASSERT(rs); + LASSERT(msg); + + if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) + len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, + req->rq_replen, 1); + else + len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); + + msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; + + rs->rs_repdata_len = len; + + if (likely(req->rq_packed_final)) { + if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) + req->rq_reply_off = plain_at_offset; + else + req->rq_reply_off = 0; + } else { + unsigned int hsize = 4; + + cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, + lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), + lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF), + NULL, 0, (unsigned char *)&msg->lm_cksum, &hsize); + req->rq_reply_off = 0; + } + + RETURN(0); +} + +static +int plain_svc_unwrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr; + int rc; + + LASSERT(req->rq_bulk_write); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; +} + +static +int plain_svc_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenv; + int rc; + + LASSERT(req->rq_bulk_read); + LASSERT(req->rq_pack_bulk); + + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute " + "checksum: %d\n", rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; +} + +static struct ptlrpc_ctx_ops plain_ctx_ops = { + .refresh = plain_ctx_refresh, + .validate = plain_ctx_validate, + .sign = plain_ctx_sign, + .verify = plain_ctx_verify, + .wrap_bulk = plain_cli_wrap_bulk, + .unwrap_bulk = plain_cli_unwrap_bulk, +}; + +static struct ptlrpc_sec_cops plain_sec_cops = { + .create_sec = plain_create_sec, + .destroy_sec = plain_destroy_sec, + .kill_sec = plain_kill_sec, + .lookup_ctx = plain_lookup_ctx, + .release_ctx = plain_release_ctx, + .flush_ctx_cache = plain_flush_ctx_cache, + .alloc_reqbuf = plain_alloc_reqbuf, + .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, + .free_repbuf = plain_free_repbuf, + .enlarge_reqbuf = plain_enlarge_reqbuf, +}; + +static struct ptlrpc_sec_sops plain_sec_sops = { + .accept = plain_accept, + .alloc_rs = plain_alloc_rs, + .authorize = plain_authorize, + .free_rs = plain_free_rs, + .unwrap_bulk = plain_svc_unwrap_bulk, + .wrap_bulk = plain_svc_wrap_bulk, +}; + +static struct ptlrpc_sec_policy plain_policy = { + .sp_owner = THIS_MODULE, + .sp_name = "plain", + .sp_policy = SPTLRPC_POLICY_PLAIN, + .sp_cops = &plain_sec_cops, + .sp_sops = &plain_sec_sops, +}; + +int sptlrpc_plain_init(void) +{ + __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; + int rc; + + buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); + plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); + + rc = sptlrpc_register_policy(&plain_policy); + if (rc) + CERROR("failed to register: %d\n", rc); + + return rc; +} + +void sptlrpc_plain_fini(void) +{ + int rc; + + rc = sptlrpc_unregister_policy(&plain_policy); + if (rc) + CERROR("cannot unregister: %d\n", rc); +} diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c new file mode 100644 index 000000000000..80111273b2dc --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/service.c @@ -0,0 +1,3128 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC +#include +#include +#include +#include +#include +#include "ptlrpc_internal.h" + +/* The following are visible and mutable through /sys/module/ptlrpc */ +int test_req_buffer_pressure = 0; +CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444, + "set non-zero to put pressure on request buffer pools"); +CFS_MODULE_PARM(at_min, "i", int, 0644, + "Adaptive timeout minimum (sec)"); +CFS_MODULE_PARM(at_max, "i", int, 0644, + "Adaptive timeout maximum (sec)"); +CFS_MODULE_PARM(at_history, "i", int, 0644, + "Adaptive timeouts remember the slowest event that took place " + "within this period (sec)"); +CFS_MODULE_PARM(at_early_margin, "i", int, 0644, + "How soon before an RPC deadline to send an early reply"); +CFS_MODULE_PARM(at_extra, "i", int, 0644, + "How much extra time to give with each early reply"); + + +/* forward ref */ +static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); +static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); + +/** Holds a list of all PTLRPC services */ +LIST_HEAD(ptlrpc_all_services); +/** Used to protect the \e ptlrpc_all_services list */ +struct mutex ptlrpc_all_services_mutex; + +struct ptlrpc_request_buffer_desc * +ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + + OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt); + if (rqbd == NULL) + return NULL; + + rqbd->rqbd_svcpt = svcpt; + rqbd->rqbd_refcount = 0; + rqbd->rqbd_cbid.cbid_fn = request_in_callback; + rqbd->rqbd_cbid.cbid_arg = rqbd; + INIT_LIST_HEAD(&rqbd->rqbd_reqs); + OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, + svcpt->scp_cpt, svc->srv_buf_size); + if (rqbd->rqbd_buffer == NULL) { + OBD_FREE_PTR(rqbd); + return NULL; + } + + spin_lock(&svcpt->scp_lock); + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + svcpt->scp_nrqbds_total++; + spin_unlock(&svcpt->scp_lock); + + return rqbd; +} + +void +ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) +{ + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + + LASSERT(rqbd->rqbd_refcount == 0); + LASSERT(list_empty(&rqbd->rqbd_reqs)); + + spin_lock(&svcpt->scp_lock); + list_del(&rqbd->rqbd_list); + svcpt->scp_nrqbds_total--; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size); + OBD_FREE_PTR(rqbd); +} + +int +ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request_buffer_desc *rqbd; + int rc = 0; + int i; + + if (svcpt->scp_rqbd_allocating) + goto try_post; + + spin_lock(&svcpt->scp_lock); + /* check again with lock */ + if (svcpt->scp_rqbd_allocating) { + /* NB: we might allow more than one thread in the future */ + LASSERT(svcpt->scp_rqbd_allocating == 1); + spin_unlock(&svcpt->scp_lock); + goto try_post; + } + + svcpt->scp_rqbd_allocating++; + spin_unlock(&svcpt->scp_lock); + + + for (i = 0; i < svc->srv_nbuf_per_group; i++) { + /* NB: another thread might have recycled enough rqbds, we + * need to make sure it wouldn't over-allocate, see LU-1212. */ + if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) + break; + + rqbd = ptlrpc_alloc_rqbd(svcpt); + + if (rqbd == NULL) { + CERROR("%s: Can't allocate request buffer\n", + svc->srv_name); + rc = -ENOMEM; + break; + } + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(svcpt->scp_rqbd_allocating == 1); + svcpt->scp_rqbd_allocating--; + + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_RPCTRACE, + "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", + svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, + svcpt->scp_nrqbds_total, rc); + + try_post: + if (post && rc == 0) + rc = ptlrpc_server_post_idle_rqbds(svcpt); + + return rc; +} + +/** + * Part of Rep-Ack logic. + * Puts a lock and its mode into reply state assotiated to request reply. + */ +void +ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + int idx; + + LASSERT(rs != NULL); + LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); + + if (req->rq_export->exp_disconnected) { + ldlm_lock_decref(lock, mode); + } else { + idx = rs->rs_nlocks++; + rs->rs_locks[idx] = *lock; + rs->rs_modes[idx] = mode; + rs->rs_difficult = 1; + rs->rs_no_ack = !!no_ack; + } +} +EXPORT_SYMBOL(ptlrpc_save_lock); + + +struct ptlrpc_hr_partition; + +struct ptlrpc_hr_thread { + int hrt_id; /* thread ID */ + spinlock_t hrt_lock; + wait_queue_head_t hrt_waitq; + struct list_head hrt_queue; /* RS queue */ + struct ptlrpc_hr_partition *hrt_partition; +}; + +struct ptlrpc_hr_partition { + /* # of started threads */ + atomic_t hrp_nstarted; + /* # of stopped threads */ + atomic_t hrp_nstopped; + /* cpu partition id */ + int hrp_cpt; + /* round-robin rotor for choosing thread */ + int hrp_rotor; + /* total number of threads on this partition */ + int hrp_nthrs; + /* threads table */ + struct ptlrpc_hr_thread *hrp_thrs; +}; + +#define HRT_RUNNING 0 +#define HRT_STOPPING 1 + +struct ptlrpc_hr_service { + /* CPU partition table, it's just cfs_cpt_table for now */ + struct cfs_cpt_table *hr_cpt_table; + /** controller sleep waitq */ + wait_queue_head_t hr_waitq; + unsigned int hr_stopping; + /** roundrobin rotor for non-affinity service */ + unsigned int hr_rotor; + /* partition data */ + struct ptlrpc_hr_partition **hr_partitions; +}; + +struct rs_batch { + struct list_head rsb_replies; + unsigned int rsb_n_replies; + struct ptlrpc_service_part *rsb_svcpt; +}; + +/** reply handling service. */ +static struct ptlrpc_hr_service ptlrpc_hr; + +/** + * maximum mumber of replies scheduled in one batch + */ +#define MAX_SCHEDULED 256 + +/** + * Initialize a reply batch. + * + * \param b batch + */ +static void rs_batch_init(struct rs_batch *b) +{ + memset(b, 0, sizeof *b); + INIT_LIST_HEAD(&b->rsb_replies); +} + +/** + * Choose an hr thread to dispatch requests to. + */ +static struct ptlrpc_hr_thread * +ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_hr_partition *hrp; + unsigned int rotor; + + if (svcpt->scp_cpt >= 0 && + svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { + /* directly match partition */ + hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; + + } else { + rotor = ptlrpc_hr.hr_rotor++; + rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); + + hrp = ptlrpc_hr.hr_partitions[rotor]; + } + + rotor = hrp->hrp_rotor++; + return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; +} + +/** + * Dispatch all replies accumulated in the batch to one from + * dedicated reply handling threads. + * + * \param b batch + */ +static void rs_batch_dispatch(struct rs_batch *b) +{ + if (b->rsb_n_replies != 0) { + struct ptlrpc_hr_thread *hrt; + + hrt = ptlrpc_hr_select(b->rsb_svcpt); + + spin_lock(&hrt->hrt_lock); + list_splice_init(&b->rsb_replies, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + b->rsb_n_replies = 0; + } +} + +/** + * Add a reply to a batch. + * Add one reply object to a batch, schedule batched replies if overload. + * + * \param b batch + * \param rs reply + */ +static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + + if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } + spin_lock(&svcpt->scp_rep_lock); + b->rsb_svcpt = svcpt; + } + spin_lock(&rs->rs_lock); + rs->rs_scheduled_ever = 1; + if (rs->rs_scheduled == 0) { + list_move(&rs->rs_list, &b->rsb_replies); + rs->rs_scheduled = 1; + b->rsb_n_replies++; + } + rs->rs_committed = 1; + spin_unlock(&rs->rs_lock); +} + +/** + * Reply batch finalization. + * Dispatch remaining replies from the batch + * and release remaining spinlock. + * + * \param b batch + */ +static void rs_batch_fini(struct rs_batch *b) +{ + if (b->rsb_svcpt != NULL) { + rs_batch_dispatch(b); + spin_unlock(&b->rsb_svcpt->scp_rep_lock); + } +} + +#define DECLARE_RS_BATCH(b) struct rs_batch b + + +/** + * Put reply state into a queue for processing because we received + * ACK from the client + */ +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_hr_thread *hrt; + ENTRY; + + LASSERT(list_empty(&rs->rs_list)); + + hrt = ptlrpc_hr_select(rs->rs_svcpt); + + spin_lock(&hrt->hrt_lock); + list_add_tail(&rs->rs_list, &hrt->hrt_queue); + spin_unlock(&hrt->hrt_lock); + + wake_up(&hrt->hrt_waitq); + EXIT; +} + +void +ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) +{ + ENTRY; + + LASSERT(spin_is_locked(&rs->rs_svcpt->scp_rep_lock)); + LASSERT(spin_is_locked(&rs->rs_lock)); + LASSERT (rs->rs_difficult); + rs->rs_scheduled_ever = 1; /* flag any notification attempt */ + + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; + return; + } + + rs->rs_scheduled = 1; + list_del_init(&rs->rs_list); + ptlrpc_dispatch_difficult_reply(rs); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); + +void ptlrpc_commit_replies(struct obd_export *exp) +{ + struct ptlrpc_reply_state *rs, *nxt; + DECLARE_RS_BATCH(batch); + ENTRY; + + rs_batch_init(&batch); + /* Find any replies that have been committed and get their service + * to attend to complete them. */ + + /* CAVEAT EMPTOR: spinlock ordering!!! */ + spin_lock(&exp->exp_uncommitted_replies_lock); + list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, + rs_obd_list) { + LASSERT (rs->rs_difficult); + /* VBR: per-export last_committed */ + LASSERT(rs->rs_export); + if (rs->rs_transno <= exp->exp_last_committed) { + list_del_init(&rs->rs_obd_list); + rs_batch_add(&batch, rs); + } + } + spin_unlock(&exp->exp_uncommitted_replies_lock); + rs_batch_fini(&batch); + EXIT; +} +EXPORT_SYMBOL(ptlrpc_commit_replies); + +static int +ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request_buffer_desc *rqbd; + int rc; + int posted = 0; + + for (;;) { + spin_lock(&svcpt->scp_lock); + + if (list_empty(&svcpt->scp_rqbd_idle)) { + spin_unlock(&svcpt->scp_lock); + return posted; + } + + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + list_del(&rqbd->rqbd_list); + + /* assume we will post successfully */ + svcpt->scp_nrqbds_posted++; + list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); + + spin_unlock(&svcpt->scp_lock); + + rc = ptlrpc_register_rqbd(rqbd); + if (rc != 0) + break; + + posted = 1; + } + + spin_lock(&svcpt->scp_lock); + + svcpt->scp_nrqbds_posted--; + list_del(&rqbd->rqbd_list); + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); + + /* Don't complain if no request buffers are posted right now; LNET + * won't drop requests because we set the portal lazy! */ + + spin_unlock(&svcpt->scp_lock); + + return -1; +} + +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service_part *svcpt; + + svcpt = (struct ptlrpc_service_part *)castmeharder; + + svcpt->scp_at_check = 1; + svcpt->scp_at_checktime = cfs_time_current(); + wake_up(&svcpt->scp_waitq); +} + +static void +ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, + struct ptlrpc_service_conf *conf) +{ + struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; + unsigned init; + unsigned total; + unsigned nthrs; + int weight; + + /* + * Common code for estimating & validating threads number. + * CPT affinity service could have percpt thread-pool instead + * of a global thread-pool, which means user might not always + * get the threads number they give it in conf::tc_nthrs_user + * even they did set. It's because we need to validate threads + * number for each CPT to guarantee each pool will have enough + * threads to keep the service healthy. + */ + init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); + init = max_t(int, init, tc->tc_nthrs_init); + + /* NB: please see comments in lustre_lnet.h for definition + * details of these members */ + LASSERT(tc->tc_nthrs_max != 0); + + if (tc->tc_nthrs_user != 0) { + /* In case there is a reason to test a service with many + * threads, we give a less strict check here, it can + * be up to 8 * nthrs_max */ + total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); + nthrs = total / svc->srv_ncpts; + init = max(init, nthrs); + goto out; + } + + total = tc->tc_nthrs_max; + if (tc->tc_nthrs_base == 0) { + /* don't care about base threads number per partition, + * this is most for non-affinity service */ + nthrs = total / svc->srv_ncpts; + goto out; + } + + nthrs = tc->tc_nthrs_base; + if (svc->srv_ncpts == 1) { + int i; + + /* NB: Increase the base number if it's single partition + * and total number of cores/HTs is larger or equal to 4. + * result will always < 2 * nthrs_base */ + weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); + for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ + (tc->tc_nthrs_base >> i) != 0; i++) + nthrs += tc->tc_nthrs_base >> i; + } + + if (tc->tc_thr_factor != 0) { + int factor = tc->tc_thr_factor; + const int fade = 4; + + /* + * User wants to increase number of threads with for + * each CPU core/HT, most likely the factor is larger then + * one thread/core because service threads are supposed to + * be blocked by lock or wait for IO. + */ + /* + * Amdahl's law says that adding processors wouldn't give + * a linear increasing of parallelism, so it's nonsense to + * have too many threads no matter how many cores/HTs + * there are. + */ + if (cfs_cpu_ht_nsiblings(0) > 1) { /* weight is # of HTs */ + /* depress thread factor for hyper-thread */ + factor = factor - (factor >> 1) + (factor >> 3); + } + + weight = cfs_cpt_weight(svc->srv_cptable, 0); + LASSERT(weight > 0); + + for (; factor > 0 && weight > 0; factor--, weight -= fade) + nthrs += min(weight, fade) * factor; + } + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + nthrs = max(tc->tc_nthrs_base, + tc->tc_nthrs_max / svc->srv_ncpts); + } + out: + nthrs = max(nthrs, tc->tc_nthrs_init); + svc->srv_nthrs_cpt_limit = nthrs; + svc->srv_nthrs_cpt_init = init; + + if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { + CDEBUG(D_OTHER, "%s: This service may have more threads (%d) " + "than the given soft limit (%d)\n", + svc->srv_name, nthrs * svc->srv_ncpts, + tc->tc_nthrs_max); + } +} + +/** + * Initialize percpt data for a service + */ +static int +ptlrpc_service_part_init(struct ptlrpc_service *svc, + struct ptlrpc_service_part *svcpt, int cpt) +{ + struct ptlrpc_at_array *array; + int size; + int index; + int rc; + + svcpt->scp_cpt = cpt; + INIT_LIST_HEAD(&svcpt->scp_threads); + + /* rqbd and incoming request queue */ + spin_lock_init(&svcpt->scp_lock); + INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); + INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); + INIT_LIST_HEAD(&svcpt->scp_req_incoming); + init_waitqueue_head(&svcpt->scp_waitq); + /* history request & rqbd list */ + INIT_LIST_HEAD(&svcpt->scp_hist_reqs); + INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); + + /* acitve requests and hp requests */ + spin_lock_init(&svcpt->scp_req_lock); + + /* reply states */ + spin_lock_init(&svcpt->scp_rep_lock); + INIT_LIST_HEAD(&svcpt->scp_rep_active); + INIT_LIST_HEAD(&svcpt->scp_rep_idle); + init_waitqueue_head(&svcpt->scp_rep_waitq); + atomic_set(&svcpt->scp_nreps_difficult, 0); + + /* adaptive timeout */ + spin_lock_init(&svcpt->scp_at_lock); + array = &svcpt->scp_at_array; + + size = at_est2timeout(at_max); + array->paa_size = size; + array->paa_count = 0; + array->paa_deadline = -1; + + /* allocate memory for scp_at_array (ptlrpc_at_array) */ + OBD_CPT_ALLOC(array->paa_reqs_array, + svc->srv_cptable, cpt, sizeof(struct list_head) * size); + if (array->paa_reqs_array == NULL) + return -ENOMEM; + + for (index = 0; index < size; index++) + INIT_LIST_HEAD(&array->paa_reqs_array[index]); + + OBD_CPT_ALLOC(array->paa_reqs_count, + svc->srv_cptable, cpt, sizeof(__u32) * size); + if (array->paa_reqs_count == NULL) + goto failed; + + cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt); + /* At SOW, service time should be quick; 10s seems generous. If client + * timeout is less than this, we'll be sending an early reply. */ + at_init(&svcpt->scp_at_estimate, 10, 0); + + /* assign this before call ptlrpc_grow_req_bufs */ + svcpt->scp_service = svc; + /* Now allocate the request buffers, but don't post them now */ + rc = ptlrpc_grow_req_bufs(svcpt, 0); + /* We shouldn't be under memory pressure at startup, so + * fail if we can't allocate all our buffers at this time. */ + if (rc != 0) + goto failed; + + return 0; + + failed: + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size); + array->paa_reqs_count = NULL; + } + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + return -ENOMEM; +} + +/** + * Initialize service on a given portal. + * This includes starting serving threads , allocating and posting rqbds and + * so on. + */ +struct ptlrpc_service * +ptlrpc_register_service(struct ptlrpc_service_conf *conf, + proc_dir_entry_t *proc_entry) +{ + struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; + struct ptlrpc_service *service; + struct ptlrpc_service_part *svcpt; + struct cfs_cpt_table *cptable; + __u32 *cpts = NULL; + int ncpts; + int cpt; + int rc; + int i; + ENTRY; + + LASSERT(conf->psc_buf.bc_nbufs > 0); + LASSERT(conf->psc_buf.bc_buf_size >= + conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); + LASSERT(conf->psc_thr.tc_ctx_tags != 0); + + cptable = cconf->cc_cptable; + if (cptable == NULL) + cptable = cfs_cpt_table; + + if (!conf->psc_thr.tc_cpu_affinity) { + ncpts = 1; + } else { + ncpts = cfs_cpt_number(cptable); + if (cconf->cc_pattern != NULL) { + struct cfs_expr_list *el; + + rc = cfs_expr_list_parse(cconf->cc_pattern, + strlen(cconf->cc_pattern), + 0, ncpts - 1, &el); + if (rc != 0) { + CERROR("%s: invalid CPT pattern string: %s", + conf->psc_name, cconf->cc_pattern); + RETURN(ERR_PTR(-EINVAL)); + } + + rc = cfs_expr_list_values(el, ncpts, &cpts); + cfs_expr_list_free(el); + if (rc <= 0) { + CERROR("%s: failed to parse CPT array %s: %d\n", + conf->psc_name, cconf->cc_pattern, rc); + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(rc < 0 ? rc : -EINVAL)); + } + ncpts = rc; + } + } + + OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts])); + if (service == NULL) { + if (cpts != NULL) + OBD_FREE(cpts, sizeof(*cpts) * ncpts); + RETURN(ERR_PTR(-ENOMEM)); + } + + service->srv_cptable = cptable; + service->srv_cpts = cpts; + service->srv_ncpts = ncpts; + + service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ + while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) + service->srv_cpt_bits++; + + /* public members */ + spin_lock_init(&service->srv_lock); + service->srv_name = conf->psc_name; + service->srv_watchdog_factor = conf->psc_watchdog_factor; + INIT_LIST_HEAD(&service->srv_list); /* for safty of cleanup */ + + /* buffer configuration */ + service->srv_nbuf_per_group = test_req_buffer_pressure ? + 1 : conf->psc_buf.bc_nbufs; + service->srv_max_req_size = conf->psc_buf.bc_req_max_size + + SPTLRPC_MAX_PAYLOAD; + service->srv_buf_size = conf->psc_buf.bc_buf_size; + service->srv_rep_portal = conf->psc_buf.bc_rep_portal; + service->srv_req_portal = conf->psc_buf.bc_req_portal; + + /* Increase max reply size to next power of two */ + service->srv_max_reply_size = 1; + while (service->srv_max_reply_size < + conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) + service->srv_max_reply_size <<= 1; + + service->srv_thread_name = conf->psc_thr.tc_thr_name; + service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_ops = conf->psc_ops; + + for (i = 0; i < ncpts; i++) { + if (!conf->psc_thr.tc_cpu_affinity) + cpt = CFS_CPT_ANY; + else + cpt = cpts != NULL ? cpts[i] : i; + + OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt)); + if (svcpt == NULL) + GOTO(failed, rc = -ENOMEM); + + service->srv_parts[i] = svcpt; + rc = ptlrpc_service_part_init(service, svcpt, cpt); + if (rc != 0) + GOTO(failed, rc); + } + + ptlrpc_server_nthreads_check(service, conf); + + rc = LNetSetLazyPortal(service->srv_req_portal); + LASSERT(rc == 0); + + mutex_lock(&ptlrpc_all_services_mutex); + list_add (&service->srv_list, &ptlrpc_all_services); + mutex_unlock(&ptlrpc_all_services_mutex); + + if (proc_entry != NULL) + ptlrpc_lprocfs_register_service(proc_entry, service); + + rc = ptlrpc_service_nrs_setup(service); + if (rc != 0) + GOTO(failed, rc); + + CDEBUG(D_NET, "%s: Started, listening on portal %d\n", + service->srv_name, service->srv_req_portal); + + rc = ptlrpc_start_threads(service); + if (rc != 0) { + CERROR("Failed to start threads for service %s: %d\n", + service->srv_name, rc); + GOTO(failed, rc); + } + + RETURN(service); +failed: + ptlrpc_unregister_service(service); + RETURN(ERR_PTR(rc)); +} +EXPORT_SYMBOL(ptlrpc_register_service); + +/** + * to actually free the request, must be called without holding svc_lock. + * note it's caller's responsibility to unlink req->rq_list. + */ +static void ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + LASSERT(atomic_read(&req->rq_refcount) == 0); + LASSERT(list_empty(&req->rq_timed_list)); + + /* DEBUG_REQ() assumes the reply state of a request with a valid + * ref will not be destroyed until that reference is dropped. */ + ptlrpc_req_drop_rs(req); + + sptlrpc_svc_ctx_decref(req); + + if (req != &req->rq_rqbd->rqbd_req) { + /* NB request buffers use an embedded + * req if the incoming req unlinked the + * MD; this isn't one of them! */ + OBD_FREE(req, sizeof(*req)); + } +} + +/** + * drop a reference count of the request. if it reaches 0, we either + * put it into history list, or free it immediately. + */ +void ptlrpc_server_drop_request(struct ptlrpc_request *req) +{ + struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; + struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + int refcount; + struct list_head *tmp; + struct list_head *nxt; + + if (!atomic_dec_and_test(&req->rq_refcount)) + return; + + if (req->rq_at_linked) { + spin_lock(&svcpt->scp_at_lock); + /* recheck with lock, in case it's unlinked by + * ptlrpc_at_check_timed() */ + if (likely(req->rq_at_linked)) + ptlrpc_at_remove_timed(req); + spin_unlock(&svcpt->scp_at_lock); + } + + LASSERT(list_empty(&req->rq_timed_list)); + + /* finalize request */ + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + + spin_lock(&svcpt->scp_lock); + + list_add(&req->rq_list, &rqbd->rqbd_reqs); + + refcount = --(rqbd->rqbd_refcount); + if (refcount == 0) { + /* request buffer is now idle: add to history */ + list_del(&rqbd->rqbd_list); + + list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); + svcpt->scp_hist_nrqbds++; + + /* cull some history? + * I expect only about 1 or 2 rqbds need to be recycled here */ + while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { + rqbd = list_entry(svcpt->scp_hist_rqbds.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + + list_del(&rqbd->rqbd_list); + svcpt->scp_hist_nrqbds--; + + /* remove rqbd's reqs from svc's req history while + * I've got the service lock */ + list_for_each(tmp, &rqbd->rqbd_reqs) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + /* Track the highest culled req seq */ + if (req->rq_history_seq > + svcpt->scp_hist_seq_culled) { + svcpt->scp_hist_seq_culled = + req->rq_history_seq; + } + list_del(&req->rq_history_list); + } + + spin_unlock(&svcpt->scp_lock); + + list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { + req = list_entry(rqbd->rqbd_reqs.next, + struct ptlrpc_request, + rq_list); + list_del(&req->rq_list); + ptlrpc_server_free_request(req); + } + + spin_lock(&svcpt->scp_lock); + /* + * now all reqs including the embedded req has been + * disposed, schedule request buffer for re-use. + */ + LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == + 0); + list_add_tail(&rqbd->rqbd_list, + &svcpt->scp_rqbd_idle); + } + + spin_unlock(&svcpt->scp_lock); + } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { + /* If we are low on memory, we are not interested in history */ + list_del(&req->rq_list); + list_del_init(&req->rq_history_list); + + /* Track the highest culled req seq */ + if (req->rq_history_seq > svcpt->scp_hist_seq_culled) + svcpt->scp_hist_seq_culled = req->rq_history_seq; + + spin_unlock(&svcpt->scp_lock); + + ptlrpc_server_free_request(req); + } else { + spin_unlock(&svcpt->scp_lock); + } +} + +/** Change request export and move hp request from old export to new */ +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export) +{ + if (req->rq_export != NULL) { + if (!list_empty(&req->rq_exp_list)) { + /* remove rq_exp_list from last export */ + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + + /* export has one reference already, so it`s safe to + * add req to export queue here and get another + * reference for request later */ + spin_lock_bh(&export->exp_rpc_lock); + list_add(&req->rq_exp_list, &export->exp_hp_rpcs); + spin_unlock_bh(&export->exp_rpc_lock); + } + class_export_rpc_dec(req->rq_export); + class_export_put(req->rq_export); + } + + /* request takes one export refcount */ + req->rq_export = class_export_get(export); + class_export_rpc_inc(export); + + return; +} + +/** + * to finish a request: stop sending more early replies, and release + * the request. + */ +static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + ptlrpc_server_hpreq_fini(req); + + ptlrpc_server_drop_request(req); +} + +/** + * to finish a active request: stop sending more early replies, and release + * the request. should be called after we finished handling the request. + */ +static void ptlrpc_server_finish_active_request( + struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + spin_lock(&svcpt->scp_req_lock); + ptlrpc_nrs_req_stop_nolock(req); + svcpt->scp_nreqs_active--; + if (req->rq_hp) + svcpt->scp_nhreqs_active--; + spin_unlock(&svcpt->scp_req_lock); + + ptlrpc_nrs_req_finalize(req); + + if (req->rq_export != NULL) + class_export_rpc_dec(req->rq_export); + + ptlrpc_server_finish_request(svcpt, req); +} + +/** + * This function makes sure dead exports are evicted in a timely manner. + * This function is only called when some export receives a message (i.e., + * the network is up.) + */ +static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) +{ + struct obd_export *oldest_exp; + time_t oldest_time, new_time; + + ENTRY; + + LASSERT(exp); + + /* Compensate for slow machines, etc, by faking our request time + into the future. Although this can break the strict time-ordering + of the list, we can be really lazy here - we don't have to evict + at the exact right moment. Eventually, all silent exports + will make it to the top of the list. */ + + /* Do not pay attention on 1sec or smaller renewals. */ + new_time = cfs_time_current_sec() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; + CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n", + exp->exp_client_uuid.uuid, + exp->exp_last_request_time, exp); + + /* exports may get disconnected from the chain even though the + export has references, so we must keep the spin lock while + manipulating the lists */ + spin_lock(&exp->exp_obd->obd_dev_lock); + + if (list_empty(&exp->exp_obd_chain_timed)) { + /* this one is not timed */ + spin_unlock(&exp->exp_obd->obd_dev_lock); + RETURN_EXIT; + } + + list_move_tail(&exp->exp_obd_chain_timed, + &exp->exp_obd->obd_exports_timed); + + oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, + struct obd_export, exp_obd_chain_timed); + oldest_time = oldest_exp->exp_last_request_time; + spin_unlock(&exp->exp_obd->obd_dev_lock); + + if (exp->exp_obd->obd_recovering) { + /* be nice to everyone during recovery */ + EXIT; + return; + } + + /* Note - racing to start/reset the obd_eviction timer is safe */ + if (exp->exp_obd->obd_eviction_timer == 0) { + /* Check if the oldest entry is expired. */ + if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { + /* We need a second timer, in case the net was down and + * it just came back. Since the pinger may skip every + * other PING_INTERVAL (see note in ptlrpc_pinger_main), + * we better wait for 3. */ + exp->exp_obd->obd_eviction_timer = + cfs_time_current_sec() + 3 * PING_INTERVAL; + CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n", + exp->exp_obd->obd_name, + obd_export_nid2str(oldest_exp), oldest_time); + } + } else { + if (cfs_time_current_sec() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { + /* The evictor won't evict anyone who we've heard from + * recently, so we don't have to check before we start + * it. */ + if (!ping_evictor_wake(exp)) + exp->exp_obd->obd_eviction_timer = 0; + } + } + + EXIT; +} + +/** + * Sanity check request \a req. + * Return 0 if all is ok, error code otherwise. + */ +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + int rc = 0; + + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt)) { + DEBUG_REQ(D_RPCTRACE, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (unlikely(req->rq_export->exp_obd && + req->rq_export->exp_obd->obd_fail)) { + /* Failing over, don't handle any more reqs, send + error response instead. */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, req->rq_export->exp_obd->obd_name); + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !(req->rq_export->exp_obd->obd_recovering)) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !(req->rq_export->exp_obd->obd_recovering)) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno " + LPU64" without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; + ptlrpc_error(req); + } + return rc; +} + +static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + __s32 next; + + if (array->paa_count == 0) { + cfs_timer_disarm(&svcpt->scp_at_timer); + return; + } + + /* Set timer for closest deadline */ + next = (__s32)(array->paa_deadline - cfs_time_current_sec() - + at_early_margin); + if (next <= 0) { + ptlrpc_at_timer((unsigned long)svcpt); + } else { + cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next)); + CDEBUG(D_INFO, "armed %s at %+ds\n", + svcpt->scp_service->srv_name, next); + } +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq = NULL; + __u32 index; + + if (AT_OFF) + return(0); + + if (req->rq_no_reply) + return 0; + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) + return(-ENOSYS); + + spin_lock(&svcpt->scp_at_lock); + LASSERT(list_empty(&req->rq_timed_list)); + + index = (unsigned long)req->rq_deadline % array->paa_size; + if (array->paa_reqs_count[index] > 0) { + /* latest rpcs will have the latest deadlines in the list, + * so search backward. */ + list_for_each_entry_reverse(rq, + &array->paa_reqs_array[index], + rq_timed_list) { + if (req->rq_deadline >= rq->rq_deadline) { + list_add(&req->rq_timed_list, + &rq->rq_timed_list); + break; + } + } + } + + /* Add the request at the head of the list */ + if (list_empty(&req->rq_timed_list)) + list_add(&req->rq_timed_list, + &array->paa_reqs_array[index]); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 1; + spin_unlock(&req->rq_lock); + req->rq_at_index = index; + array->paa_reqs_count[index]++; + array->paa_count++; + if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { + array->paa_deadline = req->rq_deadline; + ptlrpc_at_set_timer(svcpt); + } + spin_unlock(&svcpt->scp_at_lock); + + return 0; +} + +static void +ptlrpc_at_remove_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_at_array *array; + + array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; + + /* NB: must call with hold svcpt::scp_at_lock */ + LASSERT(!list_empty(&req->rq_timed_list)); + list_del_init(&req->rq_timed_list); + + spin_lock(&req->rq_lock); + req->rq_at_linked = 0; + spin_unlock(&req->rq_lock); + + array->paa_reqs_count[req->rq_at_index]--; + array->paa_count--; +} + +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; + struct ptlrpc_request *reqcopy; + struct lustre_msg *reqmsg; + cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec(); + time_t newdl; + int rc; + ENTRY; + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+lds, margin %+lds) for " + "%d+%d", AT_OFF ? "AT off - not " : "", + olddl, olddl - at_get(&svcpt->scp_at_estimate), + at_get(&svcpt->scp_at_estimate), at_extra); + + if (AT_OFF) + RETURN(0); + + if (olddl < 0) { + DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), " + "not sending early reply. Consider increasing " + "at_early_margin (%d)?", olddl, at_early_margin); + + /* Return an error so we're not re-added to the timed list. */ + RETURN(-ETIMEDOUT); + } + + if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0){ + DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, " + "but no AT support"); + RETURN(-ENOSYS); + } + + if (req->rq_export && + lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { + /* During recovery, we don't want to send too many early + * replies, but on the other hand we want to make sure the + * client has enough time to resend if the rpc is lost. So + * during the recovery period send at least 4 early replies, + * spacing them every at_extra if we can. at_estimate should + * always equal this fixed value during recovery. */ + at_measured(&svcpt->scp_at_estimate, min(at_extra, + req->rq_export->exp_obd->obd_recovery_timeout / 4)); + } else { + /* Fake our processing time into the future to ask the clients + * for some extra amount of time */ + at_measured(&svcpt->scp_at_estimate, at_extra + + cfs_time_current_sec() - + req->rq_arrival_time.tv_sec); + + /* Check to see if we've actually increased the deadline - + * we may be past adaptive_max */ + if (req->rq_deadline >= req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate)) { + DEBUG_REQ(D_WARNING, req, "Couldn't add any time " + "(%ld/%ld), not sending early reply\n", + olddl, req->rq_arrival_time.tv_sec + + at_get(&svcpt->scp_at_estimate) - + cfs_time_current_sec()); + RETURN(-ETIMEDOUT); + } + } + newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate); + + OBD_ALLOC(reqcopy, sizeof *reqcopy); + if (reqcopy == NULL) + RETURN(-ENOMEM); + OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen); + if (!reqmsg) { + OBD_FREE(reqcopy, sizeof *reqcopy); + RETURN(-ENOMEM); + } + + *reqcopy = *req; + reqcopy->rq_reply_state = NULL; + reqcopy->rq_rep_swab_mask = 0; + reqcopy->rq_pack_bulk = 0; + reqcopy->rq_pack_udesc = 0; + reqcopy->rq_packed_final = 0; + sptlrpc_svc_ctx_addref(reqcopy); + /* We only need the reqmsg for the magic */ + reqcopy->rq_reqmsg = reqmsg; + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + + LASSERT(atomic_read(&req->rq_refcount)); + /** if it is last refcount then early reply isn't needed */ + if (atomic_read(&req->rq_refcount) == 1) { + DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, " + "abort sending early reply\n"); + GOTO(out, rc = -EINVAL); + } + + /* Connection ref */ + reqcopy->rq_export = class_conn2export( + lustre_msg_get_handle(reqcopy->rq_reqmsg)); + if (reqcopy->rq_export == NULL) + GOTO(out, rc = -ENODEV); + + /* RPC ref */ + class_export_rpc_inc(reqcopy->rq_export); + if (reqcopy->rq_export->exp_obd && + reqcopy->rq_export->exp_obd->obd_fail) + GOTO(out_put, rc = -ENODEV); + + rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + GOTO(out_put, rc); + + rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = newdl; + req->rq_early_count++; /* number sent, server side */ + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(reqcopy); + +out_put: + class_export_rpc_dec(reqcopy->rq_export); + class_export_put(reqcopy->rq_export); +out: + sptlrpc_svc_ctx_decref(reqcopy); + OBD_FREE_LARGE(reqmsg, req->rq_reqlen); + OBD_FREE(reqcopy, sizeof *reqcopy); + RETURN(rc); +} + +/* Send early replies to everybody expiring within at_early_margin + asking for at_extra time */ +static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_at_array *array = &svcpt->scp_at_array; + struct ptlrpc_request *rq, *n; + struct list_head work_list; + __u32 index, count; + time_t deadline; + time_t now = cfs_time_current_sec(); + cfs_duration_t delay; + int first, counter = 0; + ENTRY; + + spin_lock(&svcpt->scp_at_lock); + if (svcpt->scp_at_check == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); + svcpt->scp_at_check = 0; + + if (array->paa_count == 0) { + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. */ + first = array->paa_deadline - now; + if (first > at_early_margin) { + /* We've still got plenty of time. Reset the timer. */ + ptlrpc_at_set_timer(svcpt); + spin_unlock(&svcpt->scp_at_lock); + RETURN(0); + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + INIT_LIST_HEAD(&work_list); + deadline = -1; + index = (unsigned long)array->paa_deadline % array->paa_size; + count = array->paa_count; + while (count > 0) { + count -= array->paa_reqs_count[index]; + list_for_each_entry_safe(rq, n, + &array->paa_reqs_array[index], + rq_timed_list) { + if (rq->rq_deadline > now + at_early_margin) { + /* update the earliest deadline */ + if (deadline == -1 || + rq->rq_deadline < deadline) + deadline = rq->rq_deadline; + break; + } + + ptlrpc_at_remove_timed(rq); + /** + * ptlrpc_server_drop_request() may drop + * refcount to 0 already. Let's check this and + * don't add entry to work_list + */ + if (likely(atomic_inc_not_zero(&rq->rq_refcount))) + list_add(&rq->rq_timed_list, &work_list); + counter++; + } + + if (++index >= array->paa_size) + index = 0; + } + array->paa_deadline = deadline; + /* we have a new earliest deadline, restart the timer */ + ptlrpc_at_set_timer(svcpt); + + spin_unlock(&svcpt->scp_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " + "replies\n", first, at_extra, counter); + if (first < 0) { + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with " + "request traffic (cpu-bound).\n", + svcpt->scp_service->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, " + "delay="CFS_DURATION_T"(jiff)\n", + counter, svcpt->scp_nreqs_incoming, + svcpt->scp_nreqs_active, + at_get(&svcpt->scp_at_estimate), delay); + } + + /* we took additional refcount so entries can't be deleted from list, no + * locking is needed */ + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + + if (ptlrpc_at_send_early_reply(rq) == 0) + ptlrpc_at_add_timed(rq); + + ptlrpc_server_drop_request(rq); + } + + RETURN(1); /* return "did_something" for liblustre */ +} + +/** + * Put the request to the export list if the request may become + * a high priority one. + */ +static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc = 0; + ENTRY; + + if (svcpt->scp_service->srv_ops.so_hpreq_handler) { + rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); + if (rc < 0) + RETURN(rc); + LASSERT(rc == 0); + } + if (req->rq_export && req->rq_ops) { + /* Perform request specific check. We should do this check + * before the request is added into exp_hp_rpcs list otherwise + * it may hit swab race at LU-1044. */ + if (req->rq_ops->hpreq_check) { + rc = req->rq_ops->hpreq_check(req); + /** + * XXX: Out of all current + * ptlrpc_hpreq_ops::hpreq_check(), only + * ldlm_cancel_hpreq_check() can return an error code; + * other functions assert in similar places, which seems + * odd. What also does not seem right is that handlers + * for those RPCs do not assert on the same checks, but + * rather handle the error cases. e.g. see + * ost_rw_hpreq_check(), and ost_brw_read(), + * ost_brw_write(). + */ + if (rc < 0) + RETURN(rc); + LASSERT(rc == 0 || rc == 1); + } + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_add(&req->rq_exp_list, + &req->rq_export->exp_hp_rpcs); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } + + ptlrpc_nrs_req_initialize(svcpt, req, rc); + + RETURN(rc); +} + +/** Remove the request from the export list. */ +static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export && req->rq_ops) { + /* refresh lock timeout again so that client has more + * room to send lock cancel RPC. */ + if (req->rq_ops->hpreq_fini) + req->rq_ops->hpreq_fini(req); + + spin_lock_bh(&req->rq_export->exp_rpc_lock); + list_del_init(&req->rq_exp_list); + spin_unlock_bh(&req->rq_export->exp_rpc_lock); + } + EXIT; +} + +static int ptlrpc_hpreq_check(struct ptlrpc_request *req) +{ + return 1; +} + +static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { + .hpreq_check = ptlrpc_hpreq_check, +}; + +/* Hi-Priority RPC check by RPC operation code. */ +int ptlrpc_hpreq_handler(struct ptlrpc_request *req) +{ + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Check for export to let only reconnects for not yet evicted + * export to become a HP rpc. */ + if ((req->rq_export != NULL) && + (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) + req->rq_ops = &ptlrpc_hpreq_common; + + return 0; +} +EXPORT_SYMBOL(ptlrpc_hpreq_handler); + +static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, + struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + rc = ptlrpc_server_hpreq_init(svcpt, req); + if (rc < 0) + RETURN(rc); + + ptlrpc_nrs_req_add(svcpt, req, !!rc); + + RETURN(0); +} + +/** + * Allow to handle high priority request + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + + if (!nrs_svcpt_has_hp(svcpt)) + return false; + + if (force) + return true; + + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + if (svcpt->scp_nhreqs_active == 0) + return true; + + return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || + svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; +} + +static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_high(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, true); +} + +/** + * Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request. + * User can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_req_lock to get reliable result + */ +static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, + bool force) +{ + int running = svcpt->scp_nthrs_running; + if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && + CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { + /* leave just 1 thread for normal RPCs */ + running = PTLRPC_NTHRS_INIT; + if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) + running += 1; + } + + if (force || + svcpt->scp_nreqs_active < running - 2) + return true; + + if (svcpt->scp_nreqs_active >= running - 1) + return false; + + return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); +} + +static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, + bool force) +{ + return ptlrpc_server_allow_normal(svcpt, force) && + ptlrpc_nrs_req_pending_nolock(svcpt, false); +} + +/** + * Returns true if there are requests available in incoming + * request queue for processing and it is allowed to fetch them. + * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock + * to get reliable result + * \see ptlrpc_server_allow_normal + * \see ptlrpc_server_allow high + */ +static inline bool +ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) +{ + return ptlrpc_server_high_pending(svcpt, force) || + ptlrpc_server_normal_pending(svcpt, force); +} + +/** + * Fetch a request for processing from queue of unprocessed requests. + * Favors high-priority requests. + * Returns a pointer to fetched request. + */ +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) +{ + struct ptlrpc_request *req = NULL; + ENTRY; + + spin_lock(&svcpt->scp_req_lock); + + if (ptlrpc_server_high_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); + if (req != NULL) { + svcpt->scp_hreq_count++; + goto got_request; + } + } + + if (ptlrpc_server_normal_pending(svcpt, force)) { + req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); + if (req != NULL) { + svcpt->scp_hreq_count = 0; + goto got_request; + } + } + + spin_unlock(&svcpt->scp_req_lock); + RETURN(NULL); + +got_request: + svcpt->scp_nreqs_active++; + if (req->rq_hp) + svcpt->scp_nhreqs_active++; + + spin_unlock(&svcpt->scp_req_lock); + + if (likely(req->rq_export)) + class_export_rpc_inc(req->rq_export); + + RETURN(req); +} + +/** + * Handle freshly incoming reqs, add to timed early reply list, + * pass on to regular request queue. + * All incoming requests pass through here before getting into + * ptlrpc_server_handle_req later on. + */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *req; + __u32 deadline; + int rc; + ENTRY; + + spin_lock(&svcpt->scp_lock); + if (list_empty(&svcpt->scp_req_incoming)) { + spin_unlock(&svcpt->scp_lock); + RETURN(0); + } + + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + list_del_init(&req->rq_list); + svcpt->scp_nreqs_incoming--; + /* Consider this still a "queued" request as far as stats are + * concerned */ + spin_unlock(&svcpt->scp_lock); + + /* go through security check/transform */ + rc = sptlrpc_svc_unwrap_request(req); + switch (rc) { + case SECSVC_OK: + break; + case SECSVC_COMPLETE: + target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto err_req; + case SECSVC_DROP: + goto err_req; + default: + LBUG(); + } + + /* + * for null-flavored rpc, msg has been unpacked by sptlrpc, although + * redo it wouldn't be harmful. + */ + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { + rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); + if (rc != 0) { + CERROR("error unpacking request: ptl %d from %s " + "x"LPU64"\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR ("error unpacking ptlrpc body: ptl %d from %s x" + LPU64"\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && + lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { + CERROR("drop incoming rpc opc %u, x"LPU64"\n", + cfs_fail_val, req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + switch(lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + case MGS_CONFIG_READ: + req->rq_bulk_read = 1; + break; + } + + CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + if (rc == 0) { + rc = sptlrpc_target_export_check(req->rq_export, req); + if (rc) + DEBUG_REQ(D_ERROR, req, "DROPPING req with " + "illegal security flavor,"); + } + + if (rc) + goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); + } + + /* req_in handling should/must be fast */ + if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s", + cfs_time_sub(cfs_time_current_sec(), + req->rq_arrival_time.tv_sec)); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & + MSGHDR_AT_SUPPORT) ? + /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } + + req->rq_svc_thread = thread; + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + rc = ptlrpc_server_request_add(svcpt, req); + if (rc) + GOTO(err_req, rc); + + wake_up(&svcpt->scp_waitq); + RETURN(1); + +err_req: + ptlrpc_server_finish_request(svcpt, req); + + RETURN(1); +} + +/** + * Main incoming request handling logic. + * Calls handler function from service to do actual processing. + */ +static int +ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_request *request; + struct timeval work_start; + struct timeval work_end; + long timediff; + int rc; + int fail_opc = 0; + ENTRY; + + request = ptlrpc_server_request_get(svcpt, false); + if (request == NULL) + RETURN(0); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) + OBD_FAIL_TIMEOUT(fail_opc, 4); + } + + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); + + if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) + libcfs_debug_dumplog(); + + do_gettimeofday(&work_start); + timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time,NULL); + if (likely(svc->srv_stats != NULL)) { + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, + timediff); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, + svcpt->scp_nreqs_incoming); + lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, + svcpt->scp_nreqs_active); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svcpt->scp_at_estimate)); + } + + rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF); + if (rc) { + CERROR("Failure to initialize session: %d\n", rc); + goto out_req; + } + request->rq_session.lc_thread = thread; + request->rq_session.lc_cookie = 0x5; + lu_context_enter(&request->rq_session); + + CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid); + + request->rq_svc_thread = thread; + if (thread) + request->rq_svc_thread->t_env->le_ses = &request->rq_session; + + if (likely(request->rq_export)) { + if (unlikely(ptlrpc_check_req(request))) + goto put_conn; + ptlrpc_update_export_timer(request->rq_export, timediff >> 19); + } + + /* Discard requests queued for longer than the deadline. + The deadline is increased if we send an early reply. */ + if (cfs_time_current_sec() > request->rq_deadline) { + DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s" + ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n", + libcfs_id2str(request->rq_peer), + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(cfs_time_current_sec(), + request->rq_deadline)); + goto put_conn; + } + + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg)); + + if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); + + rc = svc->srv_ops.so_req_handler(request); + + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); + +put_conn: + lu_context_exit(&request->rq_session); + lu_context_fini(&request->rq_session); + + if (unlikely(cfs_time_current_sec() > request->rq_deadline)) { + DEBUG_REQ(D_WARNING, request, "Request took longer " + "than estimated ("CFS_DURATION_T":"CFS_DURATION_T"s);" + " client may timeout.", + cfs_time_sub(request->rq_deadline, + request->rq_arrival_time.tv_sec), + cfs_time_sub(cfs_time_current_sec(), + request->rq_deadline)); + } + + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " + "%s:%s+%d:%d:x"LPU64":%s:%d Request procesed in " + "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", + current_comm(), + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + (request->rq_export ? + atomic_read(&request->rq_export->exp_refcount) : -99), + lustre_msg_get_status(request->rq_reqmsg), + request->rq_xid, + libcfs_id2str(request->rq_peer), + lustre_msg_get_opc(request->rq_reqmsg), + timediff, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), + (request->rq_repmsg ? + lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno), + request->rq_status, + (request->rq_repmsg ? + lustre_msg_get_status(request->rq_repmsg) : -999)); + if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { + __u32 op = lustre_msg_get_opc(request->rq_reqmsg); + int opc = opcode_offset(op); + if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { + LASSERT(opc < LUSTRE_MAX_OPCODES); + lprocfs_counter_add(svc->srv_stats, + opc + EXTRA_MAX_OPCODES, + timediff); + } + } + if (unlikely(request->rq_early_count)) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in " + CFS_DURATION_T"s", + request->rq_early_count, + cfs_time_sub(work_end.tv_sec, + request->rq_arrival_time.tv_sec)); + } + +out_req: + ptlrpc_server_finish_active_request(svcpt, request); + + RETURN(1); +} + +/** + * An internal function to process a single reply state object. + */ +static int +ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) +{ + struct ptlrpc_service_part *svcpt = rs->rs_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct obd_export *exp; + int nlocks; + int been_handled; + ENTRY; + + exp = rs->rs_export; + + LASSERT (rs->rs_difficult); + LASSERT (rs->rs_scheduled); + LASSERT (list_empty(&rs->rs_list)); + + spin_lock(&exp->exp_lock); + /* Noop if removed already */ + list_del_init (&rs->rs_exp_list); + spin_unlock(&exp->exp_lock); + + /* The disk commit callback holds exp_uncommitted_replies_lock while it + * iterates over newly committed replies, removing them from + * exp_uncommitted_replies. It then drops this lock and schedules the + * replies it found for handling here. + * + * We can avoid contention for exp_uncommitted_replies_lock between the + * HRT threads and further commit callbacks by checking rs_committed + * which is set in the commit callback while it holds both + * rs_lock and exp_uncommitted_reples. + * + * If we see rs_committed clear, the commit callback _may_ not have + * handled this reply yet and we race with it to grab + * exp_uncommitted_replies_lock before removing the reply from + * exp_uncommitted_replies. Note that if we lose the race and the + * reply has already been removed, list_del_init() is a noop. + * + * If we see rs_committed set, we know the commit callback is handling, + * or has handled this reply since store reordering might allow us to + * see rs_committed set out of sequence. But since this is done + * holding rs_lock, we can be sure it has all completed once we hold + * rs_lock, which we do right next. + */ + if (!rs->rs_committed) { + spin_lock(&exp->exp_uncommitted_replies_lock); + list_del_init(&rs->rs_obd_list); + spin_unlock(&exp->exp_uncommitted_replies_lock); + } + + spin_lock(&rs->rs_lock); + + been_handled = rs->rs_handled; + rs->rs_handled = 1; + + nlocks = rs->rs_nlocks; /* atomic "steal", but */ + rs->rs_nlocks = 0; /* locks still on rs_locks! */ + + if (nlocks == 0 && !been_handled) { + /* If we see this, we should already have seen the warning + * in mds_steal_ack_locks() */ + CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64 + " o%d NID %s\n", + rs, + rs->rs_xid, rs->rs_transno, rs->rs_opc, + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + } + + if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + spin_unlock(&rs->rs_lock); + + if (!been_handled && rs->rs_on_net) { + LNetMDUnlink(rs->rs_md_h); + /* Ignore return code; we're racing with completion */ + } + + while (nlocks-- > 0) + ldlm_lock_decref(&rs->rs_locks[nlocks], + rs->rs_modes[nlocks]); + + spin_lock(&rs->rs_lock); + } + + rs->rs_scheduled = 0; + + if (!rs->rs_on_net) { + /* Off the net */ + spin_unlock(&rs->rs_lock); + + class_export_put (exp); + rs->rs_export = NULL; + ptlrpc_rs_decref (rs); + if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && + svc->srv_is_stopping) + wake_up_all(&svcpt->scp_waitq); + RETURN(1); + } + + /* still on the net; callback will schedule */ + spin_unlock(&rs->rs_lock); + RETURN(1); +} + + +static void +ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) +{ + int avail = svcpt->scp_nrqbds_posted; + int low_water = test_req_buffer_pressure ? 0 : + svcpt->scp_service->srv_nbuf_per_group / 2; + + /* NB I'm not locking; just looking. */ + + /* CAVEAT EMPTOR: We might be allocating buffers here because we've + * allowed the request history to grow out of control. We could put a + * sanity check on that here and cull some history if we need the + * space. */ + + if (avail <= low_water) + ptlrpc_grow_req_bufs(svcpt, 1); + + if (svcpt->scp_service->srv_stats) { + lprocfs_counter_add(svcpt->scp_service->srv_stats, + PTLRPC_REQBUF_AVAIL_CNTR, avail); + } +} + +static int +ptlrpc_retry_rqbds(void *arg) +{ + struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; + + svcpt->scp_rqbd_timeout = 0; + return -ETIMEDOUT; +} + +static inline int +ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nreqs_active < + svcpt->scp_nthrs_running - 1 - + (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); +} + +/** + * allowed to create more threads + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_nthrs_running + + svcpt->scp_nthrs_starting < + svcpt->scp_service->srv_nthrs_cpt_limit; +} + +/** + * too many requests and allowed to create more threads + */ +static inline int +ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) +{ + return !ptlrpc_threads_enough(svcpt) && + ptlrpc_threads_increasable(svcpt); +} + +static inline int +ptlrpc_thread_stopping(struct ptlrpc_thread *thread) +{ + return thread_is_stopping(thread) || + thread->t_svcpt->scp_service->srv_is_stopping; +} + +static inline int +ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_rqbd_idle) && + svcpt->scp_rqbd_timeout == 0; +} + +static inline int +ptlrpc_at_check(struct ptlrpc_service_part *svcpt) +{ + return svcpt->scp_at_check; +} + +/** + * requests wait on preprocessing + * user can call it w/o any lock but need to hold + * ptlrpc_service_part::scp_lock to get reliable result + */ +static inline int +ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) +{ + return !list_empty(&svcpt->scp_req_incoming); +} + +static __attribute__((__noinline__)) int +ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, + struct ptlrpc_thread *thread) +{ + /* Don't exit while there are replies to be handled */ + struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, + ptlrpc_retry_rqbds, svcpt); + + lc_watchdog_disable(thread->t_watchdog); + + cond_resched(); + + l_wait_event_exclusive_head(svcpt->scp_waitq, + ptlrpc_thread_stopping(thread) || + ptlrpc_server_request_incoming(svcpt) || + ptlrpc_server_request_pending(svcpt, false) || + ptlrpc_rqbd_pending(svcpt) || + ptlrpc_at_check(svcpt), &lwi); + + if (ptlrpc_thread_stopping(thread)) + return -EINTR; + + lc_watchdog_touch(thread->t_watchdog, + ptlrpc_server_get_timeout(svcpt)); + return 0; +} + +/** + * Main thread body for service threads. + * Waits in a loop waiting for new requests to process to appear. + * Every time an incoming requests is added to its queue, a waitq + * is woken up and one of the threads will handle it. + */ +static int ptlrpc_main(void *arg) +{ + struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; + struct ptlrpc_service_part *svcpt = thread->t_svcpt; + struct ptlrpc_service *svc = svcpt->scp_service; + struct ptlrpc_reply_state *rs; +#ifdef WITH_GROUP_INFO + group_info_t *ginfo = NULL; +#endif + struct lu_env *env; + int counter = 0, rc = 0; + ENTRY; + + thread->t_pid = current_pid(); + unshare_fs_struct(); + + /* NB: we will call cfs_cpt_bind() for all threads, because we + * might want to run lustre server only on a subset of system CPUs, + * in that case ->scp_cpt is CFS_CPT_ANY */ + rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); + if (rc != 0) { + CWARN("%s: failed to bind %s on CPT %d\n", + svc->srv_name, thread->t_name, svcpt->scp_cpt); + } + +#ifdef WITH_GROUP_INFO + ginfo = groups_alloc(0); + if (!ginfo) { + rc = -ENOMEM; + goto out; + } + + set_current_groups(ginfo); + put_group_info(ginfo); +#endif + + if (svc->srv_ops.so_thr_init != NULL) { + rc = svc->srv_ops.so_thr_init(thread); + if (rc) + goto out; + } + + OBD_ALLOC_PTR(env); + if (env == NULL) { + rc = -ENOMEM; + goto out_srv_fini; + } + + rc = lu_context_init(&env->le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); + if (rc) + goto out_srv_fini; + + thread->t_env = env; + env->le_ctx.lc_thread = thread; + env->le_ctx.lc_cookie = 0x6; + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rc = ptlrpc_server_post_idle_rqbds(svcpt); + if (rc >= 0) + continue; + + CERROR("Failed to post rqbd for %s on CPT %d: %d\n", + svc->srv_name, svcpt->scp_cpt, rc); + goto out_srv_fini; + } + + /* Alloc reply state structure for this one */ + OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); + if (!rs) { + rc = -ENOMEM; + goto out_srv_fini; + } + + spin_lock(&svcpt->scp_lock); + + LASSERT(thread_is_starting(thread)); + thread_clear_flags(thread, SVC_STARTING); + + LASSERT(svcpt->scp_nthrs_starting == 1); + svcpt->scp_nthrs_starting--; + + /* SVC_STOPPING may already be set here if someone else is trying + * to stop the service while this new thread has been dynamically + * forked. We still set SVC_RUNNING to let our creator know that + * we are now running, however we will exit as soon as possible */ + thread_add_flags(thread, SVC_RUNNING); + svcpt->scp_nthrs_running++; + spin_unlock(&svcpt->scp_lock); + + /* wake up our creator in case he's still waiting. */ + wake_up(&thread->t_ctl_waitq); + + thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), + NULL, NULL); + + spin_lock(&svcpt->scp_rep_lock); + list_add(&rs->rs_list, &svcpt->scp_rep_idle); + wake_up(&svcpt->scp_rep_waitq); + spin_unlock(&svcpt->scp_rep_lock); + + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svcpt->scp_nthrs_running); + + /* XXX maintain a list of all managed devices: insert here */ + while (!ptlrpc_thread_stopping(thread)) { + if (ptlrpc_wait_event(svcpt, thread)) + break; + + ptlrpc_check_rqbd_pool(svcpt); + + if (ptlrpc_threads_need_create(svcpt)) { + /* Ignore return code - we tried... */ + ptlrpc_start_thread(svcpt, 0); + } + + /* Process all incoming reqs before handling any */ + if (ptlrpc_server_request_incoming(svcpt)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_req_in(svcpt, thread); + lu_context_exit(&env->le_ctx); + + /* but limit ourselves in case of flood */ + if (counter++ < 100) + continue; + counter = 0; + } + + if (ptlrpc_at_check(svcpt)) + ptlrpc_at_check_timed(svcpt); + + if (ptlrpc_server_request_pending(svcpt, false)) { + lu_context_enter(&env->le_ctx); + ptlrpc_server_handle_request(svcpt, thread); + lu_context_exit(&env->le_ctx); + } + + if (ptlrpc_rqbd_pending(svcpt) && + ptlrpc_server_post_idle_rqbds(svcpt) < 0) { + /* I just failed to repost request buffers. + * Wait for a timeout (unless something else + * happens) before I try again */ + svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; + CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", + svcpt->scp_nrqbds_posted); + } + } + + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; + +out_srv_fini: + /* + * deconstruct service specific state created by ptlrpc_start_thread() + */ + if (svc->srv_ops.so_thr_done != NULL) + svc->srv_ops.so_thr_done(thread); + + if (env != NULL) { + lu_context_fini(&env->le_ctx); + OBD_FREE_PTR(env); + } +out: + CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", + thread, thread->t_pid, thread->t_id, rc); + + spin_lock(&svcpt->scp_lock); + if (thread_test_and_clear_flags(thread, SVC_STARTING)) + svcpt->scp_nthrs_starting--; + + if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { + /* must know immediately */ + svcpt->scp_nthrs_running--; + } + + thread->t_id = rc; + thread_add_flags(thread, SVC_STOPPED); + + wake_up(&thread->t_ctl_waitq); + spin_unlock(&svcpt->scp_lock); + + return rc; +} + +static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, + struct list_head *replies) +{ + int result; + + spin_lock(&hrt->hrt_lock); + + list_splice_init(&hrt->hrt_queue, replies); + result = ptlrpc_hr.hr_stopping || !list_empty(replies); + + spin_unlock(&hrt->hrt_lock); + return result; +} + +/** + * Main body of "handle reply" function. + * It processes acked reply states + */ +static int ptlrpc_hr_main(void *arg) +{ + struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; + struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; + LIST_HEAD (replies); + char threadname[20]; + int rc; + + snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, hrt->hrt_id); + unshare_fs_struct(); + + rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); + if (rc != 0) { + CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", + threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); + } + + atomic_inc(&hrp->hrp_nstarted); + wake_up(&ptlrpc_hr.hr_waitq); + + while (!ptlrpc_hr.hr_stopping) { + l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); + + while (!list_empty(&replies)) { + struct ptlrpc_reply_state *rs; + + rs = list_entry(replies.prev, + struct ptlrpc_reply_state, + rs_list); + list_del_init(&rs->rs_list); + ptlrpc_handle_rs(rs); + } + } + + atomic_inc(&hrp->hrp_nstopped); + wake_up(&ptlrpc_hr.hr_waitq); + + return 0; +} + +static void ptlrpc_stop_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + + ptlrpc_hr.hr_stopping = 1; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + for (j = 0; j < hrp->hrp_nthrs; j++) + wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); + } + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs == NULL) + continue; /* uninitialized */ + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstopped) == + atomic_read(&hrp->hrp_nstarted)); + } +} + +static int ptlrpc_start_hr_threads(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + int j; + ENTRY; + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + int rc = 0; + + for (j = 0; j < hrp->hrp_nthrs; j++) { + struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; + rc = PTR_ERR(kthread_run(ptlrpc_hr_main, + &hrp->hrp_thrs[j], + "ptlrpc_hr%02d_%03d", + hrp->hrp_cpt, + hrt->hrt_id)); + if (IS_ERR_VALUE(rc)) + break; + } + wait_event(ptlrpc_hr.hr_waitq, + atomic_read(&hrp->hrp_nstarted) == j); + if (!IS_ERR_VALUE(rc)) + continue; + + CERROR("Reply handling thread %d:%d Failed on starting: " + "rc = %d\n", i, j, rc); + ptlrpc_stop_hr_threads(); + RETURN(rc); + } + RETURN(0); +} + +static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + LIST_HEAD (zombie); + + ENTRY; + + CDEBUG(D_INFO, "Stopping threads for service %s\n", + svcpt->scp_service->srv_name); + + spin_lock(&svcpt->scp_lock); + /* let the thread know that we would like it to stop asap */ + list_for_each_entry(thread, &svcpt->scp_threads, t_link) { + CDEBUG(D_INFO, "Stopping thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + thread_add_flags(thread, SVC_STOPPING); + } + + wake_up_all(&svcpt->scp_waitq); + + while (!list_empty(&svcpt->scp_threads)) { + thread = list_entry(svcpt->scp_threads.next, + struct ptlrpc_thread, t_link); + if (thread_is_stopped(thread)) { + list_del(&thread->t_link); + list_add(&thread->t_link, &zombie); + continue; + } + spin_unlock(&svcpt->scp_lock); + + CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", + svcpt->scp_service->srv_thread_name, thread->t_id); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), &lwi); + + spin_lock(&svcpt->scp_lock); + } + + spin_unlock(&svcpt->scp_lock); + + while (!list_empty(&zombie)) { + thread = list_entry(zombie.next, + struct ptlrpc_thread, t_link); + list_del(&thread->t_link); + OBD_FREE_PTR(thread); + } + EXIT; +} + +/** + * Stops all threads of a particular service \a svc + */ +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + ENTRY; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + ptlrpc_svcpt_stop_threads(svcpt); + } + + EXIT; +} +EXPORT_SYMBOL(ptlrpc_stop_all_threads); + +int ptlrpc_start_threads(struct ptlrpc_service *svc) +{ + int rc = 0; + int i; + int j; + ENTRY; + + /* We require 2 threads min, see note in ptlrpc_server_handle_request */ + LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); + + for (i = 0; i < svc->srv_ncpts; i++) { + for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { + rc = ptlrpc_start_thread(svc->srv_parts[i], 1); + if (rc == 0) + continue; + + if (rc != -EMFILE) + goto failed; + /* We have enough threads, don't start more. b=15759 */ + break; + } + } + + RETURN(0); + failed: + CERROR("cannot start %s thread #%d_%d: rc %d\n", + svc->srv_thread_name, i, j, rc); + ptlrpc_stop_all_threads(svc); + RETURN(rc); +} +EXPORT_SYMBOL(ptlrpc_start_threads); + +int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) +{ + struct l_wait_info lwi = { 0 }; + struct ptlrpc_thread *thread; + struct ptlrpc_service *svc; + int rc; + ENTRY; + + LASSERT(svcpt != NULL); + + svc = svcpt->scp_service; + + CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", + svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, + svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); + + again: + if (unlikely(svc->srv_is_stopping)) + RETURN(-ESRCH); + + if (!ptlrpc_threads_increasable(svcpt) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) + RETURN(-EMFILE); + + OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); + if (thread == NULL) + RETURN(-ENOMEM); + init_waitqueue_head(&thread->t_ctl_waitq); + + spin_lock(&svcpt->scp_lock); + if (!ptlrpc_threads_increasable(svcpt)) { + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + RETURN(-EMFILE); + } + + if (svcpt->scp_nthrs_starting != 0) { + /* serialize starting because some modules (obdfilter) + * might require unique and contiguous t_id */ + LASSERT(svcpt->scp_nthrs_starting == 1); + spin_unlock(&svcpt->scp_lock); + OBD_FREE_PTR(thread); + if (wait) { + CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + schedule(); + goto again; + } + + CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", + svc->srv_thread_name, svcpt->scp_thr_nextid); + RETURN(-EAGAIN); + } + + svcpt->scp_nthrs_starting++; + thread->t_id = svcpt->scp_thr_nextid++; + thread_add_flags(thread, SVC_STARTING); + thread->t_svcpt = svcpt; + + list_add(&thread->t_link, &svcpt->scp_threads); + spin_unlock(&svcpt->scp_lock); + + if (svcpt->scp_cpt >= 0) { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d", + svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); + } else { + snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d", + svc->srv_thread_name, thread->t_id); + } + + CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); + rc = PTR_ERR(kthread_run(ptlrpc_main, thread, thread->t_name)); + if (IS_ERR_VALUE(rc)) { + CERROR("cannot start thread '%s': rc %d\n", + thread->t_name, rc); + spin_lock(&svcpt->scp_lock); + list_del(&thread->t_link); + --svcpt->scp_nthrs_starting; + spin_unlock(&svcpt->scp_lock); + + OBD_FREE(thread, sizeof(*thread)); + RETURN(rc); + } + + if (!wait) + RETURN(0); + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + rc = thread_is_stopped(thread) ? thread->t_id : 0; + RETURN(rc); +} + +int ptlrpc_hr_init(void) +{ + struct ptlrpc_hr_partition *hrp; + struct ptlrpc_hr_thread *hrt; + int rc; + int i; + int j; + ENTRY; + + memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); + ptlrpc_hr.hr_cpt_table = cfs_cpt_table; + + ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, + sizeof(*hrp)); + if (ptlrpc_hr.hr_partitions == NULL) + RETURN(-ENOMEM); + + init_waitqueue_head(&ptlrpc_hr.hr_waitq); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + hrp->hrp_cpt = i; + + atomic_set(&hrp->hrp_nstarted, 0); + atomic_set(&hrp->hrp_nstopped, 0); + + hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); + hrp->hrp_nthrs /= cfs_cpu_ht_nsiblings(0); + + LASSERT(hrp->hrp_nthrs > 0); + OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i, + hrp->hrp_nthrs * sizeof(*hrt)); + if (hrp->hrp_thrs == NULL) + GOTO(out, rc = -ENOMEM); + + for (j = 0; j < hrp->hrp_nthrs; j++) { + hrt = &hrp->hrp_thrs[j]; + + hrt->hrt_id = j; + hrt->hrt_partition = hrp; + init_waitqueue_head(&hrt->hrt_waitq); + spin_lock_init(&hrt->hrt_lock); + INIT_LIST_HEAD(&hrt->hrt_queue); + } + } + + rc = ptlrpc_start_hr_threads(); +out: + if (rc != 0) + ptlrpc_hr_fini(); + RETURN(rc); +} + +void ptlrpc_hr_fini(void) +{ + struct ptlrpc_hr_partition *hrp; + int i; + + if (ptlrpc_hr.hr_partitions == NULL) + return; + + ptlrpc_stop_hr_threads(); + + cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { + if (hrp->hrp_thrs != NULL) { + OBD_FREE(hrp->hrp_thrs, + hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); + } + } + + cfs_percpt_free(ptlrpc_hr.hr_partitions); + ptlrpc_hr.hr_partitions = NULL; +} + + +/** + * Wait until all already scheduled replies are processed. + */ +static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) +{ + while (1) { + int rc; + struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), + NULL, NULL); + + rc = l_wait_event(svcpt->scp_waitq, + atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); + if (rc == 0) + break; + CWARN("Unexpectedly long timeout %s %p\n", + svcpt->scp_service->srv_name, svcpt->scp_service); + } +} + +static void +ptlrpc_service_del_atimer(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + /* early disarm AT timer... */ + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service != NULL) + cfs_timer_disarm(&svcpt->scp_at_timer); + } +} + +static void +ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct l_wait_info lwi; + int rc; + int i; + + /* All history will be culled when the next request buffer is + * freed in ptlrpc_service_purge_all() */ + svc->srv_hist_nrqbds_cpt_max = 0; + + rc = LNetClearLazyPortal(svc->srv_req_portal); + LASSERT(rc == 0); + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Unlink all the request buffers. This forces a 'final' + * event with its 'unlink' flag set for each posted rqbd */ + list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, + rqbd_list) { + rc = LNetMDUnlink(rqbd->rqbd_md_h); + LASSERT(rc == 0 || rc == -ENOENT); + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* Wait for the network to release any buffers + * it's currently filling */ + spin_lock(&svcpt->scp_lock); + while (svcpt->scp_nrqbds_posted != 0) { + spin_unlock(&svcpt->scp_lock); + /* Network access will complete in finite time but + * the HUGE timeout lets us CWARN for visibility + * of sluggish NALs */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(svcpt->scp_waitq, + svcpt->scp_nrqbds_posted == 0, &lwi); + if (rc == -ETIMEDOUT) { + CWARN("Service %s waiting for " + "request buffers\n", + svcpt->scp_service->srv_name); + } + spin_lock(&svcpt->scp_lock); + } + spin_unlock(&svcpt->scp_lock); + } +} + +static void +ptlrpc_service_purge_all(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_request_buffer_desc *rqbd; + struct ptlrpc_request *req; + struct ptlrpc_reply_state *rs; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + spin_lock(&svcpt->scp_rep_lock); + while (!list_empty(&svcpt->scp_rep_active)) { + rs = list_entry(svcpt->scp_rep_active.next, + struct ptlrpc_reply_state, rs_list); + spin_lock(&rs->rs_lock); + ptlrpc_schedule_difficult_reply(rs); + spin_unlock(&rs->rs_lock); + } + spin_unlock(&svcpt->scp_rep_lock); + + /* purge the request queue. NB No new replies (rqbds + * all unlinked) and no service threads, so I'm the only + * thread noodling the request queue now */ + while (!list_empty(&svcpt->scp_req_incoming)) { + req = list_entry(svcpt->scp_req_incoming.next, + struct ptlrpc_request, rq_list); + + list_del(&req->rq_list); + svcpt->scp_nreqs_incoming--; + ptlrpc_server_finish_request(svcpt, req); + } + + while (ptlrpc_server_request_pending(svcpt, true)) { + req = ptlrpc_server_request_get(svcpt, true); + ptlrpc_server_finish_active_request(svcpt, req); + } + + LASSERT(list_empty(&svcpt->scp_rqbd_posted)); + LASSERT(svcpt->scp_nreqs_incoming == 0); + LASSERT(svcpt->scp_nreqs_active == 0); + /* history should have been culled by + * ptlrpc_server_finish_request */ + LASSERT(svcpt->scp_hist_nrqbds == 0); + + /* Now free all the request buffers since nothing + * references them any more... */ + + while (!list_empty(&svcpt->scp_rqbd_idle)) { + rqbd = list_entry(svcpt->scp_rqbd_idle.next, + struct ptlrpc_request_buffer_desc, + rqbd_list); + ptlrpc_free_rqbd(rqbd); + } + ptlrpc_wait_replies(svcpt); + + while (!list_empty(&svcpt->scp_rep_idle)) { + rs = list_entry(svcpt->scp_rep_idle.next, + struct ptlrpc_reply_state, + rs_list); + list_del(&rs->rs_list); + OBD_FREE_LARGE(rs, svc->srv_max_reply_size); + } + } +} + +static void +ptlrpc_service_free(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + struct ptlrpc_at_array *array; + int i; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + if (svcpt->scp_service == NULL) + break; + + /* In case somebody rearmed this in the meantime */ + cfs_timer_disarm(&svcpt->scp_at_timer); + array = &svcpt->scp_at_array; + + if (array->paa_reqs_array != NULL) { + OBD_FREE(array->paa_reqs_array, + sizeof(struct list_head) * array->paa_size); + array->paa_reqs_array = NULL; + } + + if (array->paa_reqs_count != NULL) { + OBD_FREE(array->paa_reqs_count, + sizeof(__u32) * array->paa_size); + array->paa_reqs_count = NULL; + } + } + + ptlrpc_service_for_each_part(svcpt, i, svc) + OBD_FREE_PTR(svcpt); + + if (svc->srv_cpts != NULL) + cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); + + OBD_FREE(svc, offsetof(struct ptlrpc_service, + srv_parts[svc->srv_ncpts])); +} + +int ptlrpc_unregister_service(struct ptlrpc_service *service) +{ + ENTRY; + + CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); + + service->srv_is_stopping = 1; + + mutex_lock(&ptlrpc_all_services_mutex); + list_del_init(&service->srv_list); + mutex_unlock(&ptlrpc_all_services_mutex); + + ptlrpc_service_del_atimer(service); + ptlrpc_stop_all_threads(service); + + ptlrpc_service_unlink_rqbd(service); + ptlrpc_service_purge_all(service); + ptlrpc_service_nrs_cleanup(service); + + ptlrpc_lprocfs_unregister_service(service); + + ptlrpc_service_free(service); + + RETURN(0); +} +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/** + * Returns 0 if the service is healthy. + * + * Right now, it just checks to make sure that requests aren't languishing + * in the queue. We'll use this health check to govern whether a node needs + * to be shot, so it's intentionally non-aggressive. */ +int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) +{ + struct ptlrpc_request *request = NULL; + struct timeval right_now; + long timediff; + + do_gettimeofday(&right_now); + + spin_lock(&svcpt->scp_req_lock); + /* How long has the next entry been waiting? */ + if (ptlrpc_server_high_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, true); + else if (ptlrpc_server_normal_pending(svcpt, true)) + request = ptlrpc_nrs_req_peek_nolock(svcpt, false); + + if (request == NULL) { + spin_unlock(&svcpt->scp_req_lock); + return 0; + } + + timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); + spin_unlock(&svcpt->scp_req_lock); + + if ((timediff / ONE_MILLION) > + (AT_OFF ? obd_timeout * 3 / 2 : at_max)) { + CERROR("%s: unhealthy - request has been waiting %lds\n", + svcpt->scp_service->srv_name, timediff / ONE_MILLION); + return -1; + } + + return 0; +} + +int +ptlrpc_service_health_check(struct ptlrpc_service *svc) +{ + struct ptlrpc_service_part *svcpt; + int i; + + if (svc == NULL) + return 0; + + ptlrpc_service_for_each_part(svcpt, i, svc) { + int rc = ptlrpc_svcpt_health_check(svcpt); + + if (rc != 0) + return rc; + } + return 0; +} +EXPORT_SYMBOL(ptlrpc_service_health_check); diff --git a/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c new file mode 100644 index 000000000000..93bc40b422ee --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/wirehdr.c @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +# ifdef CONFIG_FS_POSIX_ACL +# include +# include +# endif + +#include +#include +#include +#include diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c new file mode 100644 index 000000000000..9890bd9cfb93 --- /dev/null +++ b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c @@ -0,0 +1,4474 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_RPC + +# ifdef CONFIG_FS_POSIX_ACL +# include +# include +# endif + +#include +#include +#include +#include +void lustre_assert_wire_constants(void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * (make -C lustre/utils newwiretest) + * running on Linux deva 2.6.32.279.lustre #5 SMP Tue Apr 9 22:52:17 CST 2013 x86_64 x86_64 x + * with gcc version 4.4.4 20100726 (Red Hat 4.4.4-13) (GCC) */ + + + /* Constants... */ + LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n", + (long long)PTL_RPC_MSG_REQUEST); + LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n", + (long long)PTL_RPC_MSG_ERR); + LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n", + (long long)PTL_RPC_MSG_REPLY); + LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n", + MDS_DIR_END_OFF); + LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n", + DEAD_HANDLE_MAGIC); + CLASSERT(MTI_NAME_MAXLEN == 64); + LASSERTF(OST_REPLY == 0, "found %lld\n", + (long long)OST_REPLY); + LASSERTF(OST_GETATTR == 1, "found %lld\n", + (long long)OST_GETATTR); + LASSERTF(OST_SETATTR == 2, "found %lld\n", + (long long)OST_SETATTR); + LASSERTF(OST_READ == 3, "found %lld\n", + (long long)OST_READ); + LASSERTF(OST_WRITE == 4, "found %lld\n", + (long long)OST_WRITE); + LASSERTF(OST_CREATE == 5, "found %lld\n", + (long long)OST_CREATE); + LASSERTF(OST_DESTROY == 6, "found %lld\n", + (long long)OST_DESTROY); + LASSERTF(OST_GET_INFO == 7, "found %lld\n", + (long long)OST_GET_INFO); + LASSERTF(OST_CONNECT == 8, "found %lld\n", + (long long)OST_CONNECT); + LASSERTF(OST_DISCONNECT == 9, "found %lld\n", + (long long)OST_DISCONNECT); + LASSERTF(OST_PUNCH == 10, "found %lld\n", + (long long)OST_PUNCH); + LASSERTF(OST_OPEN == 11, "found %lld\n", + (long long)OST_OPEN); + LASSERTF(OST_CLOSE == 12, "found %lld\n", + (long long)OST_CLOSE); + LASSERTF(OST_STATFS == 13, "found %lld\n", + (long long)OST_STATFS); + LASSERTF(OST_SYNC == 16, "found %lld\n", + (long long)OST_SYNC); + LASSERTF(OST_SET_INFO == 17, "found %lld\n", + (long long)OST_SET_INFO); + LASSERTF(OST_QUOTACHECK == 18, "found %lld\n", + (long long)OST_QUOTACHECK); + LASSERTF(OST_QUOTACTL == 19, "found %lld\n", + (long long)OST_QUOTACTL); + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LAST_OPC == 21, "found %lld\n", + (long long)OST_LAST_OPC); + LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + OBD_OBJECT_EOF); + LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n", + (long long)OST_MIN_PRECREATE); + LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n", + (long long)OST_MAX_PRECREATE); + LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_INIT); + LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n", + OST_LVB_ERR_MASK); + LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n", + (long long)MDS_FIRST_OPC); + LASSERTF(MDS_GETATTR == 33, "found %lld\n", + (long long)MDS_GETATTR); + LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n", + (long long)MDS_GETATTR_NAME); + LASSERTF(MDS_CLOSE == 35, "found %lld\n", + (long long)MDS_CLOSE); + LASSERTF(MDS_REINT == 36, "found %lld\n", + (long long)MDS_REINT); + LASSERTF(MDS_READPAGE == 37, "found %lld\n", + (long long)MDS_READPAGE); + LASSERTF(MDS_CONNECT == 38, "found %lld\n", + (long long)MDS_CONNECT); + LASSERTF(MDS_DISCONNECT == 39, "found %lld\n", + (long long)MDS_DISCONNECT); + LASSERTF(MDS_GETSTATUS == 40, "found %lld\n", + (long long)MDS_GETSTATUS); + LASSERTF(MDS_STATFS == 41, "found %lld\n", + (long long)MDS_STATFS); + LASSERTF(MDS_PIN == 42, "found %lld\n", + (long long)MDS_PIN); + LASSERTF(MDS_UNPIN == 43, "found %lld\n", + (long long)MDS_UNPIN); + LASSERTF(MDS_SYNC == 44, "found %lld\n", + (long long)MDS_SYNC); + LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", + (long long)MDS_DONE_WRITING); + LASSERTF(MDS_SET_INFO == 46, "found %lld\n", + (long long)MDS_SET_INFO); + LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", + (long long)MDS_QUOTACHECK); + LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", + (long long)MDS_QUOTACTL); + LASSERTF(MDS_GETXATTR == 49, "found %lld\n", + (long long)MDS_GETXATTR); + LASSERTF(MDS_SETXATTR == 50, "found %lld\n", + (long long)MDS_SETXATTR); + LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", + (long long)MDS_WRITEPAGE); + LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", + (long long)MDS_IS_SUBDIR); + LASSERTF(MDS_GET_INFO == 53, "found %lld\n", + (long long)MDS_GET_INFO); + LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", + (long long)MDS_HSM_STATE_GET); + LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", + (long long)MDS_HSM_STATE_SET); + LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", + (long long)MDS_HSM_ACTION); + LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", + (long long)MDS_HSM_PROGRESS); + LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", + (long long)MDS_HSM_REQUEST); + LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", + (long long)MDS_HSM_CT_REGISTER); + LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", + (long long)MDS_HSM_CT_UNREGISTER); + LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", + (long long)MDS_SWAP_LAYOUTS); + LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", + (long long)MDS_LAST_OPC); + LASSERTF(REINT_SETATTR == 1, "found %lld\n", + (long long)REINT_SETATTR); + LASSERTF(REINT_CREATE == 2, "found %lld\n", + (long long)REINT_CREATE); + LASSERTF(REINT_LINK == 3, "found %lld\n", + (long long)REINT_LINK); + LASSERTF(REINT_UNLINK == 4, "found %lld\n", + (long long)REINT_UNLINK); + LASSERTF(REINT_RENAME == 5, "found %lld\n", + (long long)REINT_RENAME); + LASSERTF(REINT_OPEN == 6, "found %lld\n", + (long long)REINT_OPEN); + LASSERTF(REINT_SETXATTR == 7, "found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_RMENTRY == 8, "found %lld\n", + (long long)REINT_RMENTRY); + LASSERTF(REINT_MAX == 9, "found %lld\n", + (long long)REINT_MAX); + LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)DISP_IT_EXECD); + LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_EXECD); + LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_NEG); + LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)DISP_LOOKUP_POS); + LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_CREATE); + LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_OPEN); + LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_COMPLETE); + LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_OPEN_REF); + LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_ENQ_CREATE_REF); + LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", + (unsigned)DISP_OPEN_LOCK); + LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", + (long long)MDS_STATUS_CONN); + LASSERTF(MDS_STATUS_LOV == 2, "found %lld\n", + (long long)MDS_STATUS_LOV); + LASSERTF(LUSTRE_BFLAG_UNCOMMITTED_WRITES == 1, "found %lld\n", + (long long)LUSTRE_BFLAG_UNCOMMITTED_WRITES); + LASSERTF(MF_SOM_CHANGE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_CHANGE); + LASSERTF(MF_EPOCH_OPEN == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_OPEN); + LASSERTF(MF_EPOCH_CLOSE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MF_EPOCH_CLOSE); + LASSERTF(MF_MDC_CANCEL_FID1 == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID1); + LASSERTF(MF_MDC_CANCEL_FID2 == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID2); + LASSERTF(MF_MDC_CANCEL_FID3 == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID3); + LASSERTF(MF_MDC_CANCEL_FID4 == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MF_MDC_CANCEL_FID4); + LASSERTF(MF_SOM_AU == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MF_SOM_AU); + LASSERTF(MF_GETATTR_LOCK == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MF_GETATTR_LOCK); + LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MODE); + LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_UID); + LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_GID); + LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_SIZE); + LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME); + LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME); + LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME); + LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATIME_SET); + LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_MTIME_SET); + LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FORCE); + LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_ATTR_FLAG); + LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SUID); + LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_KILL_SGID); + LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_CTIME_SET); + LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_FROM_OPEN); + LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", + (long long)MDS_ATTR_BLOCKS); + LASSERTF(FLD_QUERY == 900, "found %lld\n", + (long long)FLD_QUERY); + LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", + (long long)FLD_FIRST_OPC); + LASSERTF(FLD_LAST_OPC == 901, "found %lld\n", + (long long)FLD_LAST_OPC); + LASSERTF(SEQ_QUERY == 700, "found %lld\n", + (long long)SEQ_QUERY); + LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", + (long long)SEQ_FIRST_OPC); + LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", + (long long)SEQ_LAST_OPC); + LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", + (long long)SEQ_ALLOC_SUPER); + LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", + (long long)SEQ_ALLOC_META); + LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", + (long long)LDLM_ENQUEUE); + LASSERTF(LDLM_CONVERT == 102, "found %lld\n", + (long long)LDLM_CONVERT); + LASSERTF(LDLM_CANCEL == 103, "found %lld\n", + (long long)LDLM_CANCEL); + LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", + (long long)LDLM_BL_CALLBACK); + LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", + (long long)LDLM_CP_CALLBACK); + LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", + (long long)LDLM_GL_CALLBACK); + LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", + (long long)LDLM_SET_INFO); + LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", + (long long)LDLM_LAST_OPC); + LASSERTF(LCK_MINMODE == 0, "found %lld\n", + (long long)LCK_MINMODE); + LASSERTF(LCK_EX == 1, "found %lld\n", + (long long)LCK_EX); + LASSERTF(LCK_PW == 2, "found %lld\n", + (long long)LCK_PW); + LASSERTF(LCK_PR == 4, "found %lld\n", + (long long)LCK_PR); + LASSERTF(LCK_CW == 8, "found %lld\n", + (long long)LCK_CW); + LASSERTF(LCK_CR == 16, "found %lld\n", + (long long)LCK_CR); + LASSERTF(LCK_NL == 32, "found %lld\n", + (long long)LCK_NL); + LASSERTF(LCK_GROUP == 64, "found %lld\n", + (long long)LCK_GROUP); + LASSERTF(LCK_COS == 128, "found %lld\n", + (long long)LCK_COS); + LASSERTF(LCK_MAXMODE == 129, "found %lld\n", + (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", + (long long)LCK_MODE_NUM); + CLASSERT(LDLM_PLAIN == 10); + CLASSERT(LDLM_EXTENT == 11); + CLASSERT(LDLM_FLOCK == 12); + CLASSERT(LDLM_IBITS == 13); + CLASSERT(LDLM_MAX_TYPE == 14); + CLASSERT(LUSTRE_RES_ID_SEQ_OFF == 0); + CLASSERT(LUSTRE_RES_ID_VER_OID_OFF == 1); + LASSERTF(UPDATE_OBJ == 1000, "found %lld\n", + (long long)UPDATE_OBJ); + LASSERTF(UPDATE_LAST_OPC == 1001, "found %lld\n", + (long long)UPDATE_LAST_OPC); + CLASSERT(LUSTRE_RES_ID_QUOTA_SEQ_OFF == 2); + CLASSERT(LUSTRE_RES_ID_QUOTA_VER_OID_OFF == 3); + CLASSERT(LUSTRE_RES_ID_HSH_OFF == 3); + CLASSERT(LQUOTA_TYPE_USR == 0); + CLASSERT(LQUOTA_TYPE_GRP == 1); + CLASSERT(LQUOTA_RES_MD == 1); + CLASSERT(LQUOTA_RES_DT == 2); + LASSERTF(OBD_PING == 400, "found %lld\n", + (long long)OBD_PING); + LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", + (long long)OBD_LOG_CANCEL); + LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", + (long long)OBD_QC_CALLBACK); + LASSERTF(OBD_IDX_READ == 403, "found %lld\n", + (long long)OBD_IDX_READ); + LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", + (long long)OBD_LAST_OPC); + LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", + (long long)QUOTA_DQACQ); + LASSERTF(QUOTA_DQREL == 602, "found %lld\n", + (long long)QUOTA_DQREL); + LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", + (long long)QUOTA_LAST_OPC); + LASSERTF(MGS_CONNECT == 250, "found %lld\n", + (long long)MGS_CONNECT); + LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", + (long long)MGS_DISCONNECT); + LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", + (long long)MGS_EXCEPTION); + LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", + (long long)MGS_TARGET_REG); + LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", + (long long)MGS_TARGET_DEL); + LASSERTF(MGS_SET_INFO == 255, "found %lld\n", + (long long)MGS_SET_INFO); + LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", + (long long)MGS_LAST_OPC); + LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", + (long long)SEC_CTX_INIT); + LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", + (long long)SEC_CTX_INIT_CONT); + LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", + (long long)SEC_CTX_FINI); + LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", + (long long)SEC_LAST_OPC); + /* Sizes and Offsets */ + + /* Checks for struct obd_uuid */ + LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(struct obd_uuid)); + + /* Checks for struct lu_seq_range */ + LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_seq_range)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_start)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_end)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_index)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); + LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); + LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); + LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", + (long long)LU_SEQ_RANGE_MDT); + LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", + (long long)LU_SEQ_RANGE_OST); + + /* Checks for struct lustre_mdt_attrs */ + LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct lustre_mdt_attrs)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); + LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); + LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); + LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAI_RELEASED); + LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LMAC_HSM); + LASSERTF(LMAC_SOM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LMAC_SOM); + LASSERTF(OBJ_CREATE == 1, "found %lld\n", + (long long)OBJ_CREATE); + LASSERTF(OBJ_DESTROY == 2, "found %lld\n", + (long long)OBJ_DESTROY); + LASSERTF(OBJ_REF_ADD == 3, "found %lld\n", + (long long)OBJ_REF_ADD); + LASSERTF(OBJ_REF_DEL == 4, "found %lld\n", + (long long)OBJ_REF_DEL); + LASSERTF(OBJ_ATTR_SET == 5, "found %lld\n", + (long long)OBJ_ATTR_SET); + LASSERTF(OBJ_ATTR_GET == 6, "found %lld\n", + (long long)OBJ_ATTR_GET); + LASSERTF(OBJ_XATTR_SET == 7, "found %lld\n", + (long long)OBJ_XATTR_SET); + LASSERTF(OBJ_XATTR_GET == 8, "found %lld\n", + (long long)OBJ_XATTR_GET); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_LOOKUP == 9, "found %lld\n", + (long long)OBJ_INDEX_LOOKUP); + LASSERTF(OBJ_INDEX_INSERT == 10, "found %lld\n", + (long long)OBJ_INDEX_INSERT); + LASSERTF(OBJ_INDEX_DELETE == 11, "found %lld\n", + (long long)OBJ_INDEX_DELETE); + + /* Checks for struct som_attrs */ + LASSERTF((int)sizeof(struct som_attrs) == 40, "found %lld\n", + (long long)(int)sizeof(struct som_attrs)); + LASSERTF((int)offsetof(struct som_attrs, som_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_compat)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_compat)); + LASSERTF((int)offsetof(struct som_attrs, som_incompat) == 4, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_incompat)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_incompat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_incompat)); + LASSERTF((int)offsetof(struct som_attrs, som_ioepoch) == 8, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_ioepoch)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_ioepoch)); + LASSERTF((int)offsetof(struct som_attrs, som_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_size)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_size)); + LASSERTF((int)offsetof(struct som_attrs, som_blocks) == 24, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_blocks)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_blocks)); + LASSERTF((int)offsetof(struct som_attrs, som_mountid) == 32, "found %lld\n", + (long long)(int)offsetof(struct som_attrs, som_mountid)); + LASSERTF((int)sizeof(((struct som_attrs *)0)->som_mountid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct som_attrs *)0)->som_mountid)); + + /* Checks for struct hsm_attrs */ + LASSERTF((int)sizeof(struct hsm_attrs) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_attrs)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_compat) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_compat)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_compat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_compat)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_flags)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_flags)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_id)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_id)); + LASSERTF((int)offsetof(struct hsm_attrs, hsm_arch_ver) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_attrs, hsm_arch_ver)); + LASSERTF((int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_attrs *)0)->hsm_arch_ver)); + + /* Checks for struct ost_id */ + LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", + (long long)(int)sizeof(struct ost_id)); + LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_id, oi)); + LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ost_id *)0)->oi)); + LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", + (long long)LUSTRE_FID_INIT_OID); + LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", + (long long)FID_SEQ_OST_MDT0); + LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", + (long long)FID_SEQ_LLOG); + LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", + (long long)FID_SEQ_ECHO); + LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n", + (long long)FID_SEQ_OST_MDT1); + LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n", + (long long)FID_SEQ_OST_MAX); + LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", + (long long)FID_SEQ_RSVD); + LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", + (long long)FID_SEQ_IGIF); + LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IGIF_MAX); + LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF); + LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_IDIF_MAX); + LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_START); + LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOCAL_FILE); + LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_DOT_LUSTRE); + LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_SPECIAL); + LASSERTF(FID_SEQ_QUOTA == 0x0000000200000005ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA); + LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_QUOTA_GLB); + LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_ROOT); + LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_NORMAL); + LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", + (long long)FID_SEQ_LOV_DEFAULT); + LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_SPECIAL_BFL); + LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE); + LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)FID_OID_DOT_LUSTRE_OBF); + + /* Checks for struct lu_dirent */ + LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", + (long long)(int)sizeof(struct lu_dirent)); + LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_fid)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); + LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_hash)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); + LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_reclen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_namelen)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); + LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_attrs)); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); + LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lu_dirent, lde_name[0])); + LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); + LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LUDA_FID); + LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LUDA_TYPE); + LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)LUDA_64BITHASH); + + /* Checks for struct luda_type */ + LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", + (long long)(int)sizeof(struct luda_type)); + LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct luda_type, lt_type)); + LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", + (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); + + /* Checks for struct lu_dirpage */ + LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", + (long long)(int)sizeof(struct lu_dirpage)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); + LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", + (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); + LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", + (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); + LASSERTF(LDF_EMPTY == 1, "found %lld\n", + (long long)LDF_EMPTY); + LASSERTF(LDF_COLLIDE == 2, "found %lld\n", + (long long)LDF_COLLIDE); + LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", + (long long)LU_PAGE_SIZE); + /* Checks for union lu_page */ + LASSERTF((int)sizeof(union lu_page) == 4096, "found %lld\n", + (long long)(int)sizeof(union lu_page)); + + /* Checks for struct lustre_handle */ + LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", + (long long)(int)sizeof(struct lustre_handle)); + LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_handle, cookie)); + LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); + + /* Checks for struct lustre_msg_v2 */ + LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", + (long long)(int)sizeof(struct lustre_msg_v2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); + LASSERTF(LUSTRE_MSG_MAGIC_V1 == 0x0BD00BD0, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1); + LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2); + LASSERTF(LUSTRE_MSG_MAGIC_V1_SWABBED == 0xD00BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V1_SWABBED); + LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", + LUSTRE_MSG_MAGIC_V2_SWABBED); + + /* Checks for struct ptlrpc_body */ + LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", + (long long)(int)sizeof(struct ptlrpc_body_v3)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == 32, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); + CLASSERT(PTLRPC_NUM_VERSIONS == 4); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == 120, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding)); + CLASSERT(JOBSTATS_JOBID_SIZE == 32); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_seen) == (int)offsetof(struct ptlrpc_body_v2, pb_last_seen), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_seen), (int)offsetof(struct ptlrpc_body_v2, pb_last_seen)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_seen), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_seen)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); + LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding) == (int)offsetof(struct ptlrpc_body_v2, pb_padding), "%d != %d\n", + (int)offsetof(struct ptlrpc_body_v3, pb_padding), (int)offsetof(struct ptlrpc_body_v2, pb_padding)); + LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding), "%d != %d\n", + (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding)); + LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", + (long long)MSG_PTLRPC_BODY_OFF); + LASSERTF(REQ_REC_OFF == 1, "found %lld\n", + (long long)REQ_REC_OFF); + LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", + (long long)REPLY_REC_OFF); + LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREQ_OFF); + LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", + (long long)DLM_REQ_REC_OFF); + LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", + (long long)DLM_INTENT_IT_OFF); + LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", + (long long)DLM_INTENT_REC_OFF); + LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", + (long long)DLM_LOCKREPLY_OFF); + LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", + (long long)DLM_REPLY_REC_OFF); + LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", + (long long)MSG_PTLRPC_HEADER_OFF); + LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", + PTLRPC_MSG_VERSION); + LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", + LUSTRE_VERSION_MASK); + LASSERTF(LUSTRE_OBD_VERSION == 0x00010000, "found 0x%.8x\n", + LUSTRE_OBD_VERSION); + LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", + LUSTRE_MDS_VERSION); + LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", + LUSTRE_OST_VERSION); + LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", + LUSTRE_DLM_VERSION); + LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", + LUSTRE_LOG_VERSION); + LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", + LUSTRE_MGS_VERSION); + LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", + (long long)MSGHDR_AT_SUPPORT); + LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", + (long long)MSGHDR_CKSUM_INCOMPAT18); + LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", + (unsigned)MSG_OP_FLAG_MASK); + LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", + (long long)MSG_OP_FLAG_SHIFT); + LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", + (unsigned)MSG_GEN_FLAG_MASK); + LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_LAST_REPLAY); + LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_RESENT); + LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_REPLAY); + LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_DELAY_REPLAY); + LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_VERSION_REPLAY); + LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_REQ_REPLAY_DONE); + LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_LOCK_REPLAY_DONE); + LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECOVERING); + LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_RECONNECT); + LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_REPLAYABLE); + LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_LIBCLIENT); + LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_INITIAL); + LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_ASYNC); + LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_NEXT_VER); + LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)MSG_CONNECT_TRANSNO); + + /* Checks for struct obd_connect_data */ + LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", + (long long)(int)sizeof(struct obd_connect_data)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_version)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_index)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_unused)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_group)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); + LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 72, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding1)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); + LASSERTF((int)offsetof(struct obd_connect_data, padding2) == 80, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding2)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2)); + LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding3)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); + LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding4)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); + LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding5)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); + LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding6)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); + LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding7)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); + LASSERTF((int)offsetof(struct obd_connect_data, padding8) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding8)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); + LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, padding9)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingA)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingB)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingC)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingD)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingE)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); + LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, paddingF)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); + LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RDONLY); + LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_INDEX); + LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS); + LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT); + LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SRVLOCK); + LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VERSION); + LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REQPORTAL); + LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ACL); + LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_XATTR); + LASSERTF(OBD_CONNECT_CROW == 0x200ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CROW); + LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRUNCLOCK); + LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_TRANSNO); + LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IBITS); + LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOIN); + LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_ATTRFID); + LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NODEVOH); + LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT); + LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_RMT_CLIENT_FORCE); + LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_BRW_SIZE); + LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_QUOTA64); + LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_CAPA); + LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_OSS_CAPA); + LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CANCELSET); + LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SOM); + LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_AT); + LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LRU_RESIZE); + LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MDS_MDS); + LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_REAL); + LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CHANGE_QS); + LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_CKSUM); + LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FID); + LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_VBR); + LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LOV_V3); + LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_SHRINK); + LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SKIP_ORPHAN); + LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAX_EASIZE); + LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FULL20); + LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LAYOUTLOCK); + LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_64BITHASH); + LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_MAXBYTES); + LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_IMP_RECOV); + LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_JOBSTATS); + LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_UMASK); + LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_EINPROGRESS); + LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_GRANT_PARAM); + LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_FLOCK_OWNER); + LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LVB_TYPE); + LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_NANOSEC_TIME); + LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_LIGHTWEIGHT); + LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_SHORTIO); + LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", + OBD_CONNECT_PINGLESS); + LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32); + LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_ADLER); + LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)OBD_CKSUM_CRC32C); + + /* Checks for struct obdo */ + LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", + (long long)(int)sizeof(struct obdo)); + LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_valid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_valid)); + LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_oi)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_oi)); + LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_seq)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); + LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_size)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_size)); + LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mtime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); + LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_atime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_atime)); + LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ctime)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); + LASSERTF((int)offsetof(struct obdo, o_blocks) == 64, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blocks)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); + LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_grant)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_grant)); + LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_blksize)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); + LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_mode)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_mode)); + LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid)); + LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid)); + LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_flags)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_flags)); + LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_nlink)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); + LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_oid)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); + LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_misc)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_misc)); + LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_ioepoch)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); + LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_stripe_idx)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); + LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_parent_ver)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); + LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_handle)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_handle)); + LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_lcookie)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_lcookie)); + LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_uid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); + LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_gid_h)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); + LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_data_version)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); + LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_4)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); + LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_5)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); + LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", + (long long)(int)offsetof(struct obdo, o_padding_6)); + LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); + LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", + OBD_MD_FLID); + LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", + OBD_MD_FLATIME); + LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMTIME); + LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCTIME); + LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", + OBD_MD_FLSIZE); + LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLOCKS); + LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", + OBD_MD_FLBLKSZ); + LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODE); + LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", + OBD_MD_FLTYPE); + LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUID); + LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGID); + LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFLAGS); + LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLNLINK); + LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGENER); + LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRDEV); + LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEASIZE); + LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", + OBD_MD_LINKNAME); + LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLHANDLE); + LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSUM); + LASSERTF(OBD_MD_FLQOS == (0x00200000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLQOS); + LASSERTF(OBD_MD_FLCOOKIE == (0x00800000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCOOKIE); + LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGROUP); + LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLFID); + LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLEPOCH); + LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRANT); + LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDIREA); + LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLUSRQUOTA); + LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGRPQUOTA); + LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMODEASIZE); + LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MDS); + LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", + OBD_MD_REINT); + LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", + OBD_MD_MEA); + LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTR); + LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRLS); + LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLXATTRRM); + LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLACL); + LASSERTF(OBD_MD_FLRMTPERM == (0x0000010000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTPERM); + LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLMDSCAPA); + LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLOSSCAPA); + LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCKSPLIT); + LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLCROSSREF); + LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLGETATTRLOCK); + LASSERTF(OBD_MD_FLRMTLSETFACL == (0x0001000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLSETFACL); + LASSERTF(OBD_MD_FLRMTLGETFACL == (0x0002000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTLGETFACL); + LASSERTF(OBD_MD_FLRMTRSETFACL == (0x0004000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRSETFACL); + LASSERTF(OBD_MD_FLRMTRGETFACL == (0x0008000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLRMTRGETFACL); + LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", + OBD_MD_FLDATAVERSION); + CLASSERT(OBD_FL_INLINEDATA == 0x00000001); + CLASSERT(OBD_FL_OBDMDEXISTS == 0x00000002); + CLASSERT(OBD_FL_DELORPHAN == 0x00000004); + CLASSERT(OBD_FL_NORPC == 0x00000008); + CLASSERT(OBD_FL_IDONLY == 0x00000010); + CLASSERT(OBD_FL_RECREATE_OBJS == 0x00000020); + CLASSERT(OBD_FL_DEBUG_CHECK == 0x00000040); + CLASSERT(OBD_FL_NO_USRQUOTA == 0x00000100); + CLASSERT(OBD_FL_NO_GRPQUOTA == 0x00000200); + CLASSERT(OBD_FL_CREATE_CROW == 0x00000400); + CLASSERT(OBD_FL_SRVLOCK == 0x00000800); + CLASSERT(OBD_FL_CKSUM_CRC32 == 0x00001000); + CLASSERT(OBD_FL_CKSUM_ADLER == 0x00002000); + CLASSERT(OBD_FL_CKSUM_CRC32C == 0x00004000); + CLASSERT(OBD_FL_CKSUM_RSVD2 == 0x00008000); + CLASSERT(OBD_FL_CKSUM_RSVD3 == 0x00010000); + CLASSERT(OBD_FL_SHRINK_GRANT == 0x00020000); + CLASSERT(OBD_FL_MMAP == 0x00040000); + CLASSERT(OBD_FL_RECOV_RESEND == 0x00080000); + CLASSERT(OBD_FL_NOSPC_BLK == 0x00100000); + CLASSERT(OBD_FL_LOCAL_MASK == 0xf0000000); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", + (long long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + + /* Checks for struct lov_mds_md_v1 */ + LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v1)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); + LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V1 == 0x0BD10BD0); + + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16])); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); + CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0); + LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_RAID1); + LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_FIRST); + LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", + (unsigned)LOV_PATTERN_CMOBD); + + /* Checks for struct obd_statfs */ + LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", + (long long)(int)sizeof(struct obd_statfs)); + LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_type)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); + LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_blocks)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks)); + LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bfree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); + LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bavail)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); + LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_ffree)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); + LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fsid)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); + LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_bsize)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); + LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_namelen)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); + LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_state)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); + LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare2)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare3)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare4)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare5)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare6)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare6)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare7)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare8)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); + LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", + (long long)(int)offsetof(struct obd_statfs, os_spare9)); + LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); + + /* Checks for struct obd_ioobj */ + LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_ioobj)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); + LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); + LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); + + /* Checks for union lquota_id */ + LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", + (long long)(int)sizeof(union lquota_id)); + + LASSERTF(QUOTABLOCK_BITS == 10, "found %lld\n", + (long long)QUOTABLOCK_BITS); + LASSERTF(QUOTABLOCK_SIZE == 1024, "found %lld\n", + (long long)QUOTABLOCK_SIZE); + + /* Checks for struct obd_quotactl */ + LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", + (long long)(int)sizeof(struct obd_quotactl)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_type)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_id)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_stat)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); + LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); + LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); + + /* Checks for struct obd_dqinfo */ + LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", + (long long)(int)sizeof(struct obd_dqinfo)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); + LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", + (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); + LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); + + /* Checks for struct obd_dqblk */ + LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", + (long long)(int)sizeof(struct obd_dqblk)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); + LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", + (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); + LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); + LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", + Q_QUOTACHECK); + LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", + Q_INITQUOTA); + LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", + Q_GETOINFO); + LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", + Q_GETOQUOTA); + LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", + Q_FINVALIDATE); + + /* Checks for struct lquota_acct_rec */ + LASSERTF((int)sizeof(struct lquota_acct_rec) == 16, "found %lld\n", + (long long)(int)sizeof(struct lquota_acct_rec)); + LASSERTF((int)offsetof(struct lquota_acct_rec, bspace) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, bspace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->bspace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->bspace)); + LASSERTF((int)offsetof(struct lquota_acct_rec, ispace) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_acct_rec, ispace)); + LASSERTF((int)sizeof(((struct lquota_acct_rec *)0)->ispace) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_acct_rec *)0)->ispace)); + + /* Checks for struct lquota_glb_rec */ + LASSERTF((int)sizeof(struct lquota_glb_rec) == 32, "found %lld\n", + (long long)(int)sizeof(struct lquota_glb_rec)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_hardlimit) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_hardlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_hardlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_softlimit) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_softlimit)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_softlimit)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_time) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_time)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_time)); + LASSERTF((int)offsetof(struct lquota_glb_rec, qbr_granted) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_glb_rec, qbr_granted)); + LASSERTF((int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_glb_rec *)0)->qbr_granted)); + + /* Checks for struct lquota_slv_rec */ + LASSERTF((int)sizeof(struct lquota_slv_rec) == 8, "found %lld\n", + (long long)(int)sizeof(struct lquota_slv_rec)); + LASSERTF((int)offsetof(struct lquota_slv_rec, qsr_granted) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_slv_rec, qsr_granted)); + LASSERTF((int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_slv_rec *)0)->qsr_granted)); + + /* Checks for struct idx_info */ + LASSERTF((int)sizeof(struct idx_info) == 80, "found %lld\n", + (long long)(int)sizeof(struct idx_info)); + LASSERTF((int)offsetof(struct idx_info, ii_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_magic)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_magic)); + LASSERTF((int)offsetof(struct idx_info, ii_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_flags)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_flags)); + LASSERTF((int)offsetof(struct idx_info, ii_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_count)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_count) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_count)); + LASSERTF((int)offsetof(struct idx_info, ii_pad0) == 10, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad0)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad0) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad0)); + LASSERTF((int)offsetof(struct idx_info, ii_attrs) == 12, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_attrs)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_attrs) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_attrs)); + LASSERTF((int)offsetof(struct idx_info, ii_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_fid)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_fid)); + LASSERTF((int)offsetof(struct idx_info, ii_version) == 32, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_version)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_version)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_start) == 40, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_start)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_start)); + LASSERTF((int)offsetof(struct idx_info, ii_hash_end) == 48, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_hash_end)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_hash_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_hash_end)); + LASSERTF((int)offsetof(struct idx_info, ii_keysize) == 56, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_keysize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_keysize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_keysize)); + LASSERTF((int)offsetof(struct idx_info, ii_recsize) == 58, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_recsize)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_recsize) == 2, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_recsize)); + LASSERTF((int)offsetof(struct idx_info, ii_pad1) == 60, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad1)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad1)); + LASSERTF((int)offsetof(struct idx_info, ii_pad2) == 64, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad2)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad2)); + LASSERTF((int)offsetof(struct idx_info, ii_pad3) == 72, "found %lld\n", + (long long)(int)offsetof(struct idx_info, ii_pad3)); + LASSERTF((int)sizeof(((struct idx_info *)0)->ii_pad3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct idx_info *)0)->ii_pad3)); + CLASSERT(IDX_INFO_MAGIC == 0x3D37CC37); + + /* Checks for struct lu_idxpage */ + LASSERTF((int)sizeof(struct lu_idxpage) == 16, "found %lld\n", + (long long)(int)sizeof(struct lu_idxpage)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_magic)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_magic)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_flags)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_flags)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_nr) == 6, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_nr)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_nr) == 2, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_nr)); + LASSERTF((int)offsetof(struct lu_idxpage, lip_pad0) == 8, "found %lld\n", + (long long)(int)offsetof(struct lu_idxpage, lip_pad0)); + LASSERTF((int)sizeof(((struct lu_idxpage *)0)->lip_pad0) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lu_idxpage *)0)->lip_pad0)); + CLASSERT(LIP_MAGIC == 0x8A6D6B6C); + LASSERTF(LIP_HDR_SIZE == 16, "found %lld\n", + (long long)LIP_HDR_SIZE); + LASSERTF(II_FL_NOHASH == 1, "found %lld\n", + (long long)II_FL_NOHASH); + LASSERTF(II_FL_VARKEY == 2, "found %lld\n", + (long long)II_FL_VARKEY); + LASSERTF(II_FL_VARREC == 4, "found %lld\n", + (long long)II_FL_VARREC); + LASSERTF(II_FL_NONUNQ == 8, "found %lld\n", + (long long)II_FL_NONUNQ); + + /* Checks for struct niobuf_remote */ + LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", + (long long)(int)sizeof(struct niobuf_remote)); + LASSERTF((int)offsetof(struct niobuf_remote, offset) == 0, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, offset)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->offset)); + LASSERTF((int)offsetof(struct niobuf_remote, len) == 8, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, len)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->len)); + LASSERTF((int)offsetof(struct niobuf_remote, flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct niobuf_remote, flags)); + LASSERTF((int)sizeof(((struct niobuf_remote *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct niobuf_remote *)0)->flags)); + LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", + OBD_BRW_READ); + LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", + OBD_BRW_WRITE); + LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", + OBD_BRW_SYNC); + LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", + OBD_BRW_CHECK); + LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", + OBD_BRW_FROM_GRANT); + LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", + OBD_BRW_GRANTED); + LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", + OBD_BRW_NOCACHE); + LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", + OBD_BRW_NOQUOTA); + LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", + OBD_BRW_SRVLOCK); + LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", + OBD_BRW_ASYNC); + LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", + OBD_BRW_MEMALLOC); + + /* Checks for struct ost_body */ + LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", + (long long)(int)sizeof(struct ost_body)); + LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_body, oa)); + LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ost_body *)0)->oa)); + + /* Checks for struct ll_fid */ + LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", + (long long)(int)sizeof(struct ll_fid)); + LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, id)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->id)); + LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, generation)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->generation)); + LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", + (long long)(int)offsetof(struct ll_fid, f_type)); + LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); + + /* Checks for struct mdt_body */ + LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", + (long long)(int)sizeof(struct mdt_body)); + LASSERTF((int)offsetof(struct mdt_body, fid1) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid1)); + LASSERTF((int)offsetof(struct mdt_body, fid2) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fid2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fid2)); + LASSERTF((int)offsetof(struct mdt_body, handle) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, handle)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_body, valid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, valid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->valid)); + LASSERTF((int)offsetof(struct mdt_body, size) == 48, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, size)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->size)); + LASSERTF((int)offsetof(struct mdt_body, mtime) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mtime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mtime)); + LASSERTF((int)offsetof(struct mdt_body, atime) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, atime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->atime)); + LASSERTF((int)offsetof(struct mdt_body, ctime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, ctime)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->ctime)); + LASSERTF((int)offsetof(struct mdt_body, blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, blocks)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->blocks)); + LASSERTF((int)offsetof(struct mdt_body, unused1) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, unused1)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->unused1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->unused1)); + LASSERTF((int)offsetof(struct mdt_body, fsuid) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsuid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsuid)); + LASSERTF((int)offsetof(struct mdt_body, fsgid) == 108, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, fsgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->fsgid)); + LASSERTF((int)offsetof(struct mdt_body, capability) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, capability)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->capability) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->capability)); + LASSERTF((int)offsetof(struct mdt_body, mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, mode)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->mode)); + LASSERTF((int)offsetof(struct mdt_body, uid) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid)); + LASSERTF((int)offsetof(struct mdt_body, gid) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid)); + LASSERTF((int)offsetof(struct mdt_body, flags) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, flags)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->flags)); + LASSERTF((int)offsetof(struct mdt_body, rdev) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, rdev)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->rdev) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->rdev)); + LASSERTF((int)offsetof(struct mdt_body, nlink) == 136, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, nlink)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->nlink) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->nlink)); + LASSERTF((int)offsetof(struct mdt_body, unused2) == 140, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, unused2)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->unused2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->unused2)); + LASSERTF((int)offsetof(struct mdt_body, suppgid) == 144, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, suppgid)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->suppgid)); + LASSERTF((int)offsetof(struct mdt_body, eadatasize) == 148, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, eadatasize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->eadatasize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->eadatasize)); + LASSERTF((int)offsetof(struct mdt_body, aclsize) == 152, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, aclsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->aclsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->aclsize)); + LASSERTF((int)offsetof(struct mdt_body, max_mdsize) == 156, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_mdsize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_mdsize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_mdsize)); + LASSERTF((int)offsetof(struct mdt_body, max_cookiesize) == 160, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, max_cookiesize)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->max_cookiesize) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->max_cookiesize)); + LASSERTF((int)offsetof(struct mdt_body, uid_h) == 164, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, uid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->uid_h)); + LASSERTF((int)offsetof(struct mdt_body, gid_h) == 168, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, gid_h)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->gid_h)); + LASSERTF((int)offsetof(struct mdt_body, padding_5) == 172, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_5)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_5)); + LASSERTF((int)offsetof(struct mdt_body, padding_6) == 176, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_6)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_6)); + LASSERTF((int)offsetof(struct mdt_body, padding_7) == 184, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_7)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_7)); + LASSERTF((int)offsetof(struct mdt_body, padding_8) == 192, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_8)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_8) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_8)); + LASSERTF((int)offsetof(struct mdt_body, padding_9) == 200, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_9)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_9) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_9)); + LASSERTF((int)offsetof(struct mdt_body, padding_10) == 208, "found %lld\n", + (long long)(int)offsetof(struct mdt_body, padding_10)); + LASSERTF((int)sizeof(((struct mdt_body *)0)->padding_10) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_body *)0)->padding_10)); + LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", + MDS_FMODE_CLOSED); + LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", + MDS_FMODE_EXEC); + LASSERTF(MDS_FMODE_EPOCH == 000001000000UL, "found 0%.11oUL\n", + MDS_FMODE_EPOCH); + LASSERTF(MDS_FMODE_TRUNC == 000002000000UL, "found 0%.11oUL\n", + MDS_FMODE_TRUNC); + LASSERTF(MDS_FMODE_SOM == 000004000000UL, "found 0%.11oUL\n", + MDS_FMODE_SOM); + LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", + MDS_OPEN_CREATED); + LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", + MDS_OPEN_CROSS); + LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", + MDS_OPEN_CREAT); + LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", + MDS_OPEN_EXCL); + LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", + MDS_OPEN_TRUNC); + LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", + MDS_OPEN_APPEND); + LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", + MDS_OPEN_SYNC); + LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", + MDS_OPEN_DIRECTORY); + LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", + MDS_OPEN_BY_FID); + LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", + MDS_OPEN_DELAY_CREATE); + LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", + MDS_OPEN_OWNEROVERRIDE); + LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", + MDS_OPEN_JOIN_FILE); + LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", + MDS_OPEN_LOCK); + LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_EA); + LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", + MDS_OPEN_HAS_OBJS); + LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NORESTORE); + LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_NEWSTRIPE); + LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", + (long long)MDS_OPEN_VOLATILE); + LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n", + LUSTRE_SYNC_FL); + LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n", + LUSTRE_IMMUTABLE_FL); + LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n", + LUSTRE_APPEND_FL); + LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n", + LUSTRE_NOATIME_FL); + LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n", + LUSTRE_DIRSYNC_FL); + LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", + MDS_INODELOCK_LOOKUP); + LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", + MDS_INODELOCK_UPDATE); + LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", + MDS_INODELOCK_OPEN); + LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", + MDS_INODELOCK_LAYOUT); + + /* Checks for struct mdt_ioepoch */ + LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", + (long long)(int)sizeof(struct mdt_ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, handle) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, handle)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->handle)); + LASSERTF((int)offsetof(struct mdt_ioepoch, ioepoch) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, ioepoch)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->ioepoch)); + LASSERTF((int)offsetof(struct mdt_ioepoch, flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, flags)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->flags)); + LASSERTF((int)offsetof(struct mdt_ioepoch, padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_ioepoch, padding)); + LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_ioepoch *)0)->padding)); + + /* Checks for struct mdt_remote_perm */ + LASSERTF((int)sizeof(struct mdt_remote_perm) == 32, "found %lld\n", + (long long)(int)sizeof(struct mdt_remote_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_uid) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_uid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_uid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_gid) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_gid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_gid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsuid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsuid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_fsgid)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_fsgid)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_access_perm) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_access_perm)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_access_perm)); + LASSERTF((int)offsetof(struct mdt_remote_perm, rp_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_remote_perm, rp_padding)); + LASSERTF((int)sizeof(((struct mdt_remote_perm *)0)->rp_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_remote_perm *)0)->rp_padding)); + LASSERTF(CFS_SETUID_PERM == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETUID_PERM); + LASSERTF(CFS_SETGID_PERM == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGID_PERM); + LASSERTF(CFS_SETGRP_PERM == 0x00000004UL, "found 0x%.8xUL\n", + (unsigned)CFS_SETGRP_PERM); + LASSERTF(CFS_RMTACL_PERM == 0x00000008UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTACL_PERM); + LASSERTF(CFS_RMTOWN_PERM == 0x00000010UL, "found 0x%.8xUL\n", + (unsigned)CFS_RMTOWN_PERM); + + /* Checks for struct mdt_rec_setattr */ + LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setattr)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); + + /* Checks for struct mdt_rec_create */ + LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_create)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_time)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); + + /* Checks for struct mdt_rec_link */ + LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_link)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_time)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); + + /* Checks for struct mdt_rec_unlink */ + LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_unlink)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); + + /* Checks for struct mdt_rec_rename */ + LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_rename)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); + + /* Checks for struct mdt_rec_setxattr */ + LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_setxattr)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); + LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); + LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); + + /* Checks for struct mdt_rec_reint */ + LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", + (long long)(int)sizeof(struct mdt_rec_reint)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_atime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_size)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_bias)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_mode)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_umask)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask)); + LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n", + (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4)); + LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4)); + + /* Checks for struct lmv_desc */ + LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lmv_desc)); + LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_3)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3)); + LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_padding_4)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4)); + LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lmv_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid)); + + /* Checks for struct lmv_stripe_md */ + LASSERTF((int)sizeof(struct lmv_stripe_md) == 32, "found %lld\n", + (long long)(int)sizeof(struct lmv_stripe_md)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_magic)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_magic)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_count)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_count)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_master) == 8, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_master)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_master) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_master)); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_padding)); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_padding)); + CLASSERT(LOV_MAXPOOLNAME == 16); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_pool_name[16]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_pool_name[16])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_pool_name[16])); + LASSERTF((int)offsetof(struct lmv_stripe_md, mea_ids[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct lmv_stripe_md, mea_ids[0])); + LASSERTF((int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0]) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lmv_stripe_md *)0)->mea_ids[0])); + + /* Checks for struct lov_desc */ + LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n", + (long long)(int)sizeof(struct lov_desc)); + LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count)); + LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_pattern)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size)); + LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_0)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_1)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_padding_2)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); + LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_uuid)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid)); + CLASSERT(LOV_DESC_MAGIC == 0xB0CCDE5C); + + /* Checks for struct ldlm_res_id */ + LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_res_id)); + CLASSERT(RES_NAME_SIZE == 4); + LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_res_id, name[4])); + LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4])); + + /* Checks for struct ldlm_extent */ + LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n", + (long long)(int)sizeof(struct ldlm_extent)); + LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, start)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->start)); + LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, end)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->end)); + LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_extent, gid)); + LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_extent *)0)->gid)); + + /* Checks for struct ldlm_inodebits */ + LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_inodebits)); + LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_inodebits, bits)); + LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits)); + + /* Checks for struct ldlm_flock_wire */ + LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n", + (long long)(int)sizeof(struct ldlm_flock_wire)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding)); + LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n", + (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid)); + LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid)); + + /* Checks for struct ldlm_intent */ + LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n", + (long long)(int)sizeof(struct ldlm_intent)); + LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_intent, opc)); + LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_intent *)0)->opc)); + + /* Checks for struct ldlm_resource_desc */ + LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n", + (long long)(int)sizeof(struct ldlm_resource_desc)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_type)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding)); + LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_resource_desc, lr_name)); + LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name)); + + /* Checks for struct ldlm_lock_desc */ + LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(struct ldlm_lock_desc)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_resource)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode)); + LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data)); + LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data)); + + /* Checks for struct ldlm_request */ + LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n", + (long long)(int)sizeof(struct ldlm_request)); + LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_count)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count)); + LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_request, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle)); + + /* Checks for struct ldlm_reply */ + LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n", + (long long)(int)sizeof(struct ldlm_reply)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_flags)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_padding)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_desc)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_handle)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1)); + LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n", + (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2)); + LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2)); + + /* Checks for struct ost_lvb_v1 */ + LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb_v1)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks)); + + /* Checks for struct ost_lvb */ + LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n", + (long long)(int)sizeof(struct ost_lvb)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_size)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_blocks)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns)); + LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n", + (long long)(int)offsetof(struct ost_lvb, lvb_padding)); + LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding)); + + /* Checks for struct lquota_lvb */ + LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n", + (long long)(int)sizeof(struct lquota_lvb)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_flags)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit)); + LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n", + (long long)(int)offsetof(struct lquota_lvb, lvb_pad1)); + LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1)); + LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n", + (long long)LQUOTA_FL_EDQUOT); + + /* Checks for struct ldlm_gl_lquota_desc */ + LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n", + (long long)(int)sizeof(struct ldlm_gl_lquota_desc)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time)); + LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n", + (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2)); + LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2)); + + /* Checks for struct mgs_send_param */ + LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n", + (long long)(int)sizeof(struct mgs_send_param)); + CLASSERT(MGS_PARAM_MAXLEN == 1024); + LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n", + (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024])); + LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024])); + + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_padding)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + + /* Checks for struct llog_logid */ + LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n", + (long long)(int)sizeof(struct llog_logid)); + LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_oi)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi)); + LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid, lgl_ogen)); + LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen)); + CLASSERT(OST_SZ_REC == 274730752); + CLASSERT(MDS_UNLINK_REC == 274801668); + CLASSERT(MDS_UNLINK64_REC == 275325956); + CLASSERT(MDS_SETATTR64_REC == 275325953); + CLASSERT(OBD_CFG_REC == 274857984); + CLASSERT(LLOG_GEN_REC == 274989056); + CLASSERT(CHANGELOG_REC == 275120128); + CLASSERT(CHANGELOG_USER_REC == 275185664); + CLASSERT(LLOG_HDR_MAGIC == 275010873); + CLASSERT(LLOG_LOGID_MAGIC == 275010875); + + /* Checks for struct llog_catid */ + LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_catid)); + LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_logid)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding1)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding2)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2)); + LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_catid, lci_padding3)); + LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3)); + + /* Checks for struct llog_rec_hdr */ + LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_hdr)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_len)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_index)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_type)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type)); + LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_hdr, lrh_id)); + LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id)); + + /* Checks for struct llog_rec_tail */ + LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n", + (long long)(int)sizeof(struct llog_rec_tail)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_len)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len)); + LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct llog_rec_tail, lrt_index)); + LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index)); + + /* Checks for struct llog_logid_rec */ + LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_logid_rec)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_hdr)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_id)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding1)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding2)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_padding3)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3)); + LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_logid_rec, lid_tail)); + LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail)); + + /* Checks for struct llog_unlink_rec */ + LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink_rec)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oid)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail)); + /* Checks for struct llog_unlink64_rec */ + LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_unlink64_rec)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_count)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2)); + LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3)); + LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3)); + + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_padding) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_padding)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_padding)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + + /* Checks for struct llog_size_change_rec */ + LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_size_change_rec)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3)); + LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail)); + LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail)); + + /* Checks for struct changelog_rec */ + LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct changelog_rec)); + LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid)); + + /* Checks for struct changelog_ext_rec */ + LASSERTF((int)sizeof(struct changelog_ext_rec) == 96, "found %lld\n", + (long long)(int)sizeof(struct changelog_ext_rec)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_namelen) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_namelen)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_namelen)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_flags) == 2, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_flags)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_flags)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_type) == 4, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_type)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_type)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_index) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_index)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_index) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_index)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_prev) == 16, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_prev)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_prev) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_prev)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_time) == 24, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_time)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_time) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_time)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_tfid) == 32, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_tfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_tfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_pfid) == 48, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_pfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_pfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_sfid) == 64, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_sfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_sfid)); + LASSERTF((int)offsetof(struct changelog_ext_rec, cr_spfid) == 80, "found %lld\n", + (long long)(int)offsetof(struct changelog_ext_rec, cr_spfid)); + LASSERTF((int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct changelog_ext_rec *)0)->cr_spfid)); + + /* Checks for struct changelog_setinfo */ + LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n", + (long long)(int)sizeof(struct changelog_setinfo)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_recno)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno)); + LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct changelog_setinfo, cs_id)); + LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id)); + + /* Checks for struct llog_changelog_rec */ + LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_rec)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr)); + LASSERTF((int)offsetof(struct llog_changelog_rec, cr_tail) == 80, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_rec, cr_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_tail)); + + /* Checks for struct llog_changelog_user_rec */ + LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n", + (long long)(int)sizeof(struct llog_changelog_user_rec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec)); + LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail)); + LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail)); + + /* Checks for struct llog_gen */ + LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n", + (long long)(int)sizeof(struct llog_gen)); + LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, mnt_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt)); + LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n", + (long long)(int)offsetof(struct llog_gen, conn_cnt)); + LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt)); + + /* Checks for struct llog_gen_rec */ + LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n", + (long long)(int)sizeof(struct llog_gen_rec)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_gen)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen)); + LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n", + (long long)(int)offsetof(struct llog_gen_rec, lgr_tail)); + LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail)); + + /* Checks for struct llog_log_hdr */ + LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n", + (long long)(int)sizeof(struct llog_log_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_hdr)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_count)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_size)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_flags)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_reserved)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap)); + LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n", + (long long)(int)offsetof(struct llog_log_hdr, llh_tail)); + LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail)); + + /* Checks for struct llog_cookie */ + LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n", + (long long)(int)sizeof(struct llog_cookie)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_lgl)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_subsys)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_index)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index)); + LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n", + (long long)(int)offsetof(struct llog_cookie, lgc_padding)); + LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding)); + + /* Checks for struct llogd_body */ + LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n", + (long long)(int)sizeof(struct llogd_body)); + LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_logid)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid)); + LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx)); + LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_llh_flags)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags)); + LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_saved_index)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index)); + LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_len)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len)); + LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n", + (long long)(int)offsetof(struct llogd_body, lgd_cur_offset)); + LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n", + (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset)); + CLASSERT(LLOG_ORIGIN_HANDLE_CREATE == 501); + CLASSERT(LLOG_ORIGIN_HANDLE_NEXT_BLOCK == 502); + CLASSERT(LLOG_ORIGIN_HANDLE_READ_HEADER == 503); + CLASSERT(LLOG_ORIGIN_HANDLE_WRITE_REC == 504); + CLASSERT(LLOG_ORIGIN_HANDLE_CLOSE == 505); + CLASSERT(LLOG_ORIGIN_CONNECT == 506); + CLASSERT(LLOG_CATINFO == 507); + CLASSERT(LLOG_ORIGIN_HANDLE_PREV_BLOCK == 508); + CLASSERT(LLOG_ORIGIN_HANDLE_DESTROY == 509); + CLASSERT(LLOG_FIRST_OPC == 501); + CLASSERT(LLOG_LAST_OPC == 510); + + /* Checks for struct llogd_conn_body */ + LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n", + (long long)(int)sizeof(struct llogd_conn_body)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid)); + LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n", + (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx)); + LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n", + (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx)); + + /* Checks for struct ll_fiemap_info_key */ + LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_info_key)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, name[8]) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, name[8])); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->name[8]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->name[8])); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, oa) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, oa)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->oa) == 208, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->oa)); + LASSERTF((int)offsetof(struct ll_fiemap_info_key, fiemap) == 216, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_info_key, fiemap)); + LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->fiemap)); + + /* Checks for struct quota_body */ + LASSERTF((int)sizeof(struct quota_body) == 112, "found %lld\n", + (long long)(int)sizeof(struct quota_body)); + LASSERTF((int)offsetof(struct quota_body, qb_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_fid)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_fid)); + LASSERTF((int)offsetof(struct quota_body, qb_id) == 16, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_id)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_id) == 16, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_id)); + LASSERTF((int)offsetof(struct quota_body, qb_flags) == 32, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_flags)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_flags)); + LASSERTF((int)offsetof(struct quota_body, qb_padding) == 36, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding)); + LASSERTF((int)offsetof(struct quota_body, qb_count) == 40, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_count)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_count) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_count)); + LASSERTF((int)offsetof(struct quota_body, qb_usage) == 48, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_usage)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_usage) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_usage)); + LASSERTF((int)offsetof(struct quota_body, qb_slv_ver) == 56, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_slv_ver)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_slv_ver) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_slv_ver)); + LASSERTF((int)offsetof(struct quota_body, qb_lockh) == 64, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_glb_lockh) == 72, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_glb_lockh)); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_glb_lockh) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_glb_lockh)); + LASSERTF((int)offsetof(struct quota_body, qb_padding1[4]) == 112, "found %lld\n", + (long long)(int)offsetof(struct quota_body, qb_padding1[4])); + LASSERTF((int)sizeof(((struct quota_body *)0)->qb_padding1[4]) == 8, "found %lld\n", + (long long)(int)sizeof(((struct quota_body *)0)->qb_padding1[4])); + + /* Checks for struct mgs_target_info */ + LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n", + (long long)(int)sizeof(struct mgs_target_info)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_config_ver)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_flags)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nid_count)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_instance)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_fsname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_svname)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_uuid)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_nids)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids)); + LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n", + (long long)(int)offsetof(struct mgs_target_info, mti_params)); + LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n", + (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params)); + + /* Checks for struct lustre_capa */ + LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa)); + LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_fid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_opc)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc)); + LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_uid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_gid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_flags)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags)); + LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid)); + LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_timeout)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout)); + LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_expiry)); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry)); + CLASSERT(CAPA_HMAC_MAX_LEN == 64); + LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa, lc_hmac[64])); + LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64])); + + /* Checks for struct lustre_capa_key */ + LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n", + (long long)(int)sizeof(struct lustre_capa_key)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_seq)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_keyid)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid)); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); + CLASSERT(CAPA_HMAC_KEY_MAX_LEN == 56); + LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", + (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); + LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); + + /* Checks for struct getinfo_fid2path */ + LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", + (long long)(int)sizeof(struct getinfo_fid2path)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); + LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", + (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); + LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", + (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); + + /* Checks for struct ll_user_fiemap */ + LASSERTF((int)sizeof(struct ll_user_fiemap) == 32, "found %lld\n", + (long long)(int)sizeof(struct ll_user_fiemap)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_start) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_start)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_start)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_length) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_length)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_length)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_flags)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_flags)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_mapped_extents) == 20, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_mapped_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_mapped_extents)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extent_count) == 24, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_extent_count)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extent_count)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_reserved) == 28, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_reserved)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_reserved)); + LASSERTF((int)offsetof(struct ll_user_fiemap, fm_extents) == 32, "found %lld\n", + (long long)(int)offsetof(struct ll_user_fiemap, fm_extents)); + LASSERTF((int)sizeof(((struct ll_user_fiemap *)0)->fm_extents) == 0, "found %lld\n", + (long long)(int)sizeof(((struct ll_user_fiemap *)0)->fm_extents)); + CLASSERT(FIEMAP_FLAG_SYNC == 0x00000001); + CLASSERT(FIEMAP_FLAG_XATTR == 0x00000002); + CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); + + /* Checks for struct ll_fiemap_extent */ + LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, "found %lld\n", + (long long)(int)sizeof(struct ll_fiemap_extent)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_logical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_physical) == 8, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_physical)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_physical)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_length) == 16, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_length)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags)); + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, "found %lld\n", + (long long)(int)offsetof(struct ll_fiemap_extent, fe_device)); + LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, "found %lld\n", + (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device)); + CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); + CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); + CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); + CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); + CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); + CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); + CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); + CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); + CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); + + /* Checks for type posix_acl_xattr_entry */ + LASSERTF((int)sizeof(posix_acl_xattr_entry) == 8, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_entry)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_tag)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_tag)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_perm)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_perm)); + LASSERTF((int)offsetof(posix_acl_xattr_entry, e_id) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_entry, e_id)); + LASSERTF((int)sizeof(((posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_entry *)0)->e_id)); + + /* Checks for type posix_acl_xattr_header */ + LASSERTF((int)sizeof(posix_acl_xattr_header) == 4, "found %lld\n", + (long long)(int)sizeof(posix_acl_xattr_header)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_version) == 0, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_version)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_version)); + LASSERTF((int)offsetof(posix_acl_xattr_header, a_entries) == 4, "found %lld\n", + (long long)(int)offsetof(posix_acl_xattr_header, a_entries)); + LASSERTF((int)sizeof(((posix_acl_xattr_header *)0)->a_entries) == 0, "found %lld\n", + (long long)(int)sizeof(((posix_acl_xattr_header *)0)->a_entries)); + + /* Checks for struct link_ea_header */ + LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", + (long long)(int)sizeof(struct link_ea_header)); + LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_magic)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); + LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_reccount)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); + LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, leh_len)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); + LASSERTF((int)offsetof(struct link_ea_header, padding1) == 16, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, padding1)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->padding1)); + LASSERTF((int)offsetof(struct link_ea_header, padding2) == 20, "found %lld\n", + (long long)(int)offsetof(struct link_ea_header, padding2)); + LASSERTF((int)sizeof(((struct link_ea_header *)0)->padding2) == 4, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_header *)0)->padding2)); + CLASSERT(LINK_EA_MAGIC == 0x11EAF1DFUL); + + /* Checks for struct link_ea_entry */ + LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", + (long long)(int)sizeof(struct link_ea_entry)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); + LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", + (long long)(int)offsetof(struct link_ea_entry, lee_name)); + LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n", + (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); + + /* Checks for struct layout_intent */ + LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", + (long long)(int)sizeof(struct layout_intent)); + LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_opc)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); + LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_flags)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); + LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_start)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); + LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", + (long long)(int)offsetof(struct layout_intent, li_end)); + LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", + (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); + LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", + (long long)LAYOUT_INTENT_ACCESS); + LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", + (long long)LAYOUT_INTENT_READ); + LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", + (long long)LAYOUT_INTENT_WRITE); + LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", + (long long)LAYOUT_INTENT_GLIMPSE); + LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", + (long long)LAYOUT_INTENT_TRUNC); + LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", + (long long)LAYOUT_INTENT_RELEASE); + LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", + (long long)LAYOUT_INTENT_RESTORE); + + /* Checks for struct hsm_action_item */ + LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_item)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_len)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_action)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_fid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_extent)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_cookie)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_gid)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); + LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_item, hai_data)); + LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); + + /* Checks for struct hsm_action_list */ + LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_action_list)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_version)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_count)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_flags)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); + LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, padding1)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); + LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", + (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); + LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); + + /* Checks for struct hsm_progress */ + LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress)); + LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_fid)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); + LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie)); + LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_extent)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent)); + LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_flags)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags)); + LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, hp_errval)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval)); + LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress, padding)); + LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); + LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", + HP_FLAG_COMPLETED); + LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", + HP_FLAG_RETRY); + + LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_data_version)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version)); + LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_flags)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags)); + LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_errval)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval)); + LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, padding)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->padding)); + LASSERTF((int)offsetof(struct hsm_copy, hc_hai) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_copy, hc_hai)); + LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n", + (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai)); + + /* Checks for struct hsm_progress_kernel */ + LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n", + (long long)(int)sizeof(struct hsm_progress_kernel)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version)); + LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n", + (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2)); + LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2)); + + /* Checks for struct hsm_user_item */ + LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_item)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_fid)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid)); + LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_item, hui_extent)); + LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent)); + + /* Checks for struct hsm_user_state */ + LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_states)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_archive_id)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action)); + LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location)); + LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location)); + + /* Checks for struct hsm_state_set */ + LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_state_set)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_valid)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_archive_id)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_setmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask)); + LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_state_set, hss_clearmask)); + LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask)); + + /* Checks for struct hsm_current_action */ + LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_current_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_state)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_action)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action)); + LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_current_action, hca_location)); + LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n", + (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location)); + + /* Checks for struct hsm_request */ + LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_request)); + LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_action)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_action)); + LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_archive_id)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id)); + LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_flags)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags)); + LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_itemcount)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount)); + LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n", + (long long)(int)offsetof(struct hsm_request, hr_data_len)); + LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", + (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); + LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", + (unsigned)HSM_FORCE_ACTION); + LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", + (unsigned)HSM_GHOST_COPY); + + /* Checks for struct hsm_user_request */ + LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", + (long long)(int)sizeof(struct hsm_user_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_request)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request)); + LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n", + (long long)(int)offsetof(struct hsm_user_request, hur_user_item)); + LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found %lld\n", + (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item)); + + /* Checks for struct update_buf */ + LASSERTF((int)sizeof(struct update_buf) == 8, "found %lld\n", + (long long)(int)sizeof(struct update_buf)); + LASSERTF((int)offsetof(struct update_buf, ub_magic) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_buf, ub_magic)); + LASSERTF((int)sizeof(((struct update_buf *)0)->ub_magic) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_buf *)0)->ub_magic)); + LASSERTF((int)offsetof(struct update_buf, ub_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct update_buf, ub_count)); + LASSERTF((int)sizeof(((struct update_buf *)0)->ub_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_buf *)0)->ub_count)); + LASSERTF((int)offsetof(struct update_buf, ub_bufs) == 8, "found %lld\n", + (long long)(int)offsetof(struct update_buf, ub_bufs)); + LASSERTF((int)sizeof(((struct update_buf *)0)->ub_bufs) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_buf *)0)->ub_bufs)); + + /* Checks for struct update_reply */ + LASSERTF((int)sizeof(struct update_reply) == 8, "found %lld\n", + (long long)(int)sizeof(struct update_reply)); + LASSERTF((int)offsetof(struct update_reply, ur_version) == 0, "found %lld\n", + (long long)(int)offsetof(struct update_reply, ur_version)); + LASSERTF((int)sizeof(((struct update_reply *)0)->ur_version) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_reply *)0)->ur_version)); + LASSERTF((int)offsetof(struct update_reply, ur_count) == 4, "found %lld\n", + (long long)(int)offsetof(struct update_reply, ur_count)); + LASSERTF((int)sizeof(((struct update_reply *)0)->ur_count) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update_reply *)0)->ur_count)); + LASSERTF((int)offsetof(struct update_reply, ur_lens) == 8, "found %lld\n", + (long long)(int)offsetof(struct update_reply, ur_lens)); + LASSERTF((int)sizeof(((struct update_reply *)0)->ur_lens) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update_reply *)0)->ur_lens)); + + /* Checks for struct update */ + LASSERTF((int)sizeof(struct update) == 56, "found %lld\n", + (long long)(int)sizeof(struct update)); + LASSERTF((int)offsetof(struct update, u_type) == 0, "found %lld\n", + (long long)(int)offsetof(struct update, u_type)); + LASSERTF((int)sizeof(((struct update *)0)->u_type) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update *)0)->u_type)); + LASSERTF((int)offsetof(struct update, u_batchid) == 4, "found %lld\n", + (long long)(int)offsetof(struct update, u_batchid)); + LASSERTF((int)sizeof(((struct update *)0)->u_batchid) == 4, "found %lld\n", + (long long)(int)sizeof(((struct update *)0)->u_batchid)); + LASSERTF((int)offsetof(struct update, u_fid) == 8, "found %lld\n", + (long long)(int)offsetof(struct update, u_fid)); + LASSERTF((int)sizeof(((struct update *)0)->u_fid) == 16, "found %lld\n", + (long long)(int)sizeof(((struct update *)0)->u_fid)); + LASSERTF((int)offsetof(struct update, u_lens) == 24, "found %lld\n", + (long long)(int)offsetof(struct update, u_lens)); + LASSERTF((int)sizeof(((struct update *)0)->u_lens) == 32, "found %lld\n", + (long long)(int)sizeof(((struct update *)0)->u_lens)); + LASSERTF((int)offsetof(struct update, u_bufs) == 56, "found %lld\n", + (long long)(int)offsetof(struct update, u_bufs)); + LASSERTF((int)sizeof(((struct update *)0)->u_bufs) == 0, "found %lld\n", + (long long)(int)sizeof(((struct update *)0)->u_bufs)); +}