2f713b5c7d
A large directory full of differently sized file names triggered this. Most directories, even very large directories with shorter names, would be lucky enough to fit in one server response. Signed-off-by: Martin Brandenburg <martin@omnibond.com> Signed-off-by: Mike Marshall <hubcap@omnibond.com>
405 lines
9.1 KiB
C
405 lines
9.1 KiB
C
/*
|
|
* Copyright 2017 Omnibond Systems, L.L.C.
|
|
*/
|
|
|
|
#include "protocol.h"
|
|
#include "orangefs-kernel.h"
|
|
#include "orangefs-bufmap.h"
|
|
|
|
struct orangefs_dir_part {
|
|
struct orangefs_dir_part *next;
|
|
size_t len;
|
|
};
|
|
|
|
struct orangefs_dir {
|
|
__u64 token;
|
|
struct orangefs_dir_part *part;
|
|
loff_t end;
|
|
int error;
|
|
};
|
|
|
|
#define PART_SHIFT (24)
|
|
#define PART_SIZE (1<<24)
|
|
#define PART_MASK (~(PART_SIZE - 1))
|
|
|
|
/*
|
|
* There can be up to 512 directory entries. Each entry is encoded as
|
|
* follows:
|
|
* 4 bytes: string size (n)
|
|
* n bytes: string
|
|
* 1 byte: trailing zero
|
|
* padding to 8 bytes
|
|
* 16 bytes: khandle
|
|
* padding to 8 bytes
|
|
*
|
|
* The trailer_buf starts with a struct orangefs_readdir_response_s
|
|
* which must be skipped to get to the directory data.
|
|
*
|
|
* The data which is received from the userspace daemon is termed a
|
|
* part and is stored in a linked list in case more than one part is
|
|
* needed for a large directory.
|
|
*
|
|
* The position pointer (ctx->pos) encodes the part and offset on which
|
|
* to begin reading at. Bits above PART_SHIFT encode the part and bits
|
|
* below PART_SHIFT encode the offset. Parts are stored in a linked
|
|
* list which grows as data is received from the server. The overhead
|
|
* associated with managing the list is presumed to be small compared to
|
|
* the overhead of communicating with the server.
|
|
*
|
|
* As data is received from the server, it is placed at the end of the
|
|
* part list. Data is parsed from the current position as it is needed.
|
|
* When data is determined to be corrupt, it is either because the
|
|
* userspace component has sent back corrupt data or because the file
|
|
* pointer has been moved to an invalid location. Since the two cannot
|
|
* be differentiated, return EIO.
|
|
*
|
|
* Part zero is synthesized to contains `.' and `..'. Part one is the
|
|
* first part of the part list.
|
|
*/
|
|
|
|
static int do_readdir(struct orangefs_inode_s *oi,
|
|
struct orangefs_dir *od, struct dentry *dentry,
|
|
struct orangefs_kernel_op_s *op)
|
|
{
|
|
struct orangefs_readdir_response_s *resp;
|
|
int bufi, r;
|
|
|
|
/*
|
|
* Despite the badly named field, readdir does not use shared
|
|
* memory. However, there are a limited number of readdir
|
|
* slots, which must be allocated here. This flag simply tells
|
|
* the op scheduler to return the op here for retry.
|
|
*/
|
|
op->uses_shared_memory = 1;
|
|
op->upcall.req.readdir.refn = oi->refn;
|
|
op->upcall.req.readdir.token = od->token;
|
|
op->upcall.req.readdir.max_dirent_count =
|
|
ORANGEFS_MAX_DIRENT_COUNT_READDIR;
|
|
|
|
again:
|
|
bufi = orangefs_readdir_index_get();
|
|
if (bufi < 0) {
|
|
od->error = bufi;
|
|
return bufi;
|
|
}
|
|
|
|
op->upcall.req.readdir.buf_index = bufi;
|
|
|
|
r = service_operation(op, "orangefs_readdir",
|
|
get_interruptible_flag(dentry->d_inode));
|
|
|
|
orangefs_readdir_index_put(bufi);
|
|
|
|
if (op_state_purged(op)) {
|
|
if (r == -EAGAIN) {
|
|
vfree(op->downcall.trailer_buf);
|
|
goto again;
|
|
} else if (r == -EIO) {
|
|
vfree(op->downcall.trailer_buf);
|
|
od->error = r;
|
|
return r;
|
|
}
|
|
}
|
|
|
|
if (r < 0) {
|
|
vfree(op->downcall.trailer_buf);
|
|
od->error = r;
|
|
return r;
|
|
} else if (op->downcall.status) {
|
|
vfree(op->downcall.trailer_buf);
|
|
od->error = op->downcall.status;
|
|
return op->downcall.status;
|
|
}
|
|
|
|
/*
|
|
* The maximum size is size per entry times the 512 entries plus
|
|
* the header. This is well under the limit.
|
|
*/
|
|
if (op->downcall.trailer_size > PART_SIZE) {
|
|
vfree(op->downcall.trailer_buf);
|
|
od->error = -EIO;
|
|
return -EIO;
|
|
}
|
|
|
|
resp = (struct orangefs_readdir_response_s *)
|
|
op->downcall.trailer_buf;
|
|
od->token = resp->token;
|
|
return 0;
|
|
}
|
|
|
|
static int parse_readdir(struct orangefs_dir *od,
|
|
struct orangefs_kernel_op_s *op)
|
|
{
|
|
struct orangefs_dir_part *part, *new;
|
|
size_t count;
|
|
|
|
count = 1;
|
|
part = od->part;
|
|
while (part) {
|
|
count++;
|
|
if (part->next)
|
|
part = part->next;
|
|
else
|
|
break;
|
|
}
|
|
|
|
new = (void *)op->downcall.trailer_buf;
|
|
new->next = NULL;
|
|
new->len = op->downcall.trailer_size -
|
|
sizeof(struct orangefs_readdir_response_s);
|
|
if (!od->part)
|
|
od->part = new;
|
|
else
|
|
part->next = new;
|
|
count++;
|
|
od->end = count << PART_SHIFT;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int orangefs_dir_more(struct orangefs_inode_s *oi,
|
|
struct orangefs_dir *od, struct dentry *dentry)
|
|
{
|
|
struct orangefs_kernel_op_s *op;
|
|
int r;
|
|
|
|
op = op_alloc(ORANGEFS_VFS_OP_READDIR);
|
|
if (!op) {
|
|
od->error = -ENOMEM;
|
|
return -ENOMEM;
|
|
}
|
|
r = do_readdir(oi, od, dentry, op);
|
|
if (r) {
|
|
od->error = r;
|
|
goto out;
|
|
}
|
|
r = parse_readdir(od, op);
|
|
if (r) {
|
|
od->error = r;
|
|
goto out;
|
|
}
|
|
|
|
od->error = 0;
|
|
out:
|
|
op_release(op);
|
|
return od->error;
|
|
}
|
|
|
|
static int fill_from_part(struct orangefs_dir_part *part,
|
|
struct dir_context *ctx)
|
|
{
|
|
const int offset = sizeof(struct orangefs_readdir_response_s);
|
|
struct orangefs_khandle *khandle;
|
|
__u32 *len, padlen;
|
|
loff_t i;
|
|
char *s;
|
|
i = ctx->pos & ~PART_MASK;
|
|
|
|
/* The file offset from userspace is too large. */
|
|
if (i > part->len)
|
|
return 1;
|
|
|
|
/*
|
|
* If the seek pointer is positioned just before an entry it
|
|
* should find the next entry.
|
|
*/
|
|
if (i % 8)
|
|
i = i + (8 - i%8)%8;
|
|
|
|
while (i < part->len) {
|
|
if (part->len < i + sizeof *len)
|
|
break;
|
|
len = (void *)part + offset + i;
|
|
/*
|
|
* len is the size of the string itself. padlen is the
|
|
* total size of the encoded string.
|
|
*/
|
|
padlen = (sizeof *len + *len + 1) +
|
|
(8 - (sizeof *len + *len + 1)%8)%8;
|
|
if (part->len < i + padlen + sizeof *khandle)
|
|
goto next;
|
|
s = (void *)part + offset + i + sizeof *len;
|
|
if (s[*len] != 0)
|
|
goto next;
|
|
khandle = (void *)part + offset + i + padlen;
|
|
if (!dir_emit(ctx, s, *len,
|
|
orangefs_khandle_to_ino(khandle),
|
|
DT_UNKNOWN))
|
|
return 0;
|
|
i += padlen + sizeof *khandle;
|
|
i = i + (8 - i%8)%8;
|
|
BUG_ON(i > part->len);
|
|
ctx->pos = (ctx->pos & PART_MASK) | i;
|
|
continue;
|
|
next:
|
|
i += 8;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static int orangefs_dir_fill(struct orangefs_inode_s *oi,
|
|
struct orangefs_dir *od, struct dentry *dentry,
|
|
struct dir_context *ctx)
|
|
{
|
|
struct orangefs_dir_part *part;
|
|
size_t count;
|
|
|
|
count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
|
|
|
|
part = od->part;
|
|
while (part->next && count) {
|
|
count--;
|
|
part = part->next;
|
|
}
|
|
/* This means the userspace file offset is invalid. */
|
|
if (count) {
|
|
od->error = -EIO;
|
|
return -EIO;
|
|
}
|
|
|
|
while (part && part->len) {
|
|
int r;
|
|
r = fill_from_part(part, ctx);
|
|
if (r < 0) {
|
|
od->error = r;
|
|
return r;
|
|
} else if (r == 0) {
|
|
/* Userspace buffer is full. */
|
|
break;
|
|
} else {
|
|
/*
|
|
* The part ran out of data. Move to the next
|
|
* part. */
|
|
ctx->pos = (ctx->pos & PART_MASK) +
|
|
(1 << PART_SHIFT);
|
|
part = part->next;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
|
|
int whence)
|
|
{
|
|
struct orangefs_dir *od = file->private_data;
|
|
/*
|
|
* Delete the stored data so userspace sees new directory
|
|
* entries.
|
|
*/
|
|
if (!whence && offset < od->end) {
|
|
struct orangefs_dir_part *part = od->part;
|
|
while (part) {
|
|
struct orangefs_dir_part *next = part->next;
|
|
vfree(part);
|
|
part = next;
|
|
}
|
|
od->token = ORANGEFS_ITERATE_START;
|
|
od->part = NULL;
|
|
od->end = 1 << PART_SHIFT;
|
|
}
|
|
return default_llseek(file, offset, whence);
|
|
}
|
|
|
|
static int orangefs_dir_iterate(struct file *file,
|
|
struct dir_context *ctx)
|
|
{
|
|
struct orangefs_inode_s *oi;
|
|
struct orangefs_dir *od;
|
|
struct dentry *dentry;
|
|
int r;
|
|
|
|
dentry = file->f_path.dentry;
|
|
oi = ORANGEFS_I(dentry->d_inode);
|
|
od = file->private_data;
|
|
|
|
if (od->error)
|
|
return od->error;
|
|
|
|
if (ctx->pos == 0) {
|
|
if (!dir_emit_dot(file, ctx))
|
|
return 0;
|
|
ctx->pos++;
|
|
}
|
|
if (ctx->pos == 1) {
|
|
if (!dir_emit_dotdot(file, ctx))
|
|
return 0;
|
|
ctx->pos = 1 << PART_SHIFT;
|
|
}
|
|
|
|
/*
|
|
* The seek position is in the first synthesized part but is not
|
|
* valid.
|
|
*/
|
|
if ((ctx->pos & PART_MASK) == 0)
|
|
return -EIO;
|
|
|
|
r = 0;
|
|
|
|
/*
|
|
* Must read more if the user has sought past what has been read
|
|
* so far. Stop a user who has sought past the end.
|
|
*/
|
|
while (od->token != ORANGEFS_ITERATE_END &&
|
|
ctx->pos > od->end) {
|
|
r = orangefs_dir_more(oi, od, dentry);
|
|
if (r)
|
|
return r;
|
|
}
|
|
if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
|
|
return -EIO;
|
|
|
|
/* Then try to fill if there's any left in the buffer. */
|
|
if (ctx->pos < od->end) {
|
|
r = orangefs_dir_fill(oi, od, dentry, ctx);
|
|
if (r)
|
|
return r;
|
|
}
|
|
|
|
/* Finally get some more and try to fill. */
|
|
if (od->token != ORANGEFS_ITERATE_END) {
|
|
r = orangefs_dir_more(oi, od, dentry);
|
|
if (r)
|
|
return r;
|
|
r = orangefs_dir_fill(oi, od, dentry, ctx);
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
static int orangefs_dir_open(struct inode *inode, struct file *file)
|
|
{
|
|
struct orangefs_dir *od;
|
|
file->private_data = kmalloc(sizeof(struct orangefs_dir),
|
|
GFP_KERNEL);
|
|
if (!file->private_data)
|
|
return -ENOMEM;
|
|
od = file->private_data;
|
|
od->token = ORANGEFS_ITERATE_START;
|
|
od->part = NULL;
|
|
od->end = 1 << PART_SHIFT;
|
|
od->error = 0;
|
|
return 0;
|
|
}
|
|
|
|
static int orangefs_dir_release(struct inode *inode, struct file *file)
|
|
{
|
|
struct orangefs_dir *od = file->private_data;
|
|
struct orangefs_dir_part *part = od->part;
|
|
orangefs_flush_inode(inode);
|
|
while (part) {
|
|
struct orangefs_dir_part *next = part->next;
|
|
vfree(part);
|
|
part = next;
|
|
}
|
|
kfree(od);
|
|
return 0;
|
|
}
|
|
|
|
const struct file_operations orangefs_dir_operations = {
|
|
.llseek = orangefs_dir_llseek,
|
|
.read = generic_read_dir,
|
|
.iterate = orangefs_dir_iterate,
|
|
.open = orangefs_dir_open,
|
|
.release = orangefs_dir_release
|
|
};
|