orangefs: implement orangefs_readahead
mm/readahead.c/read_pages was quite a bit different back when I put my open-coded readahead logic into orangefs_readpage. It seemed to work as designed then, but it is a trainwreck now. This patch implements orangefs_readahead using the new xarray and readahead_expand features that have just been pulled, and removes all my open-coded readahead logic. This patch results in an extreme read performance improvement; these sample numbers are from my test VM. Here's an example of what's upstream in 5.11.8-200.fc33.x86_64: 30+0 records in; 30+0 records out; 125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s. And here's this version of orangefs_readahead on top of 5.12.0-rc4: 30+0 records in; 30+0 records out; 125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s. There are four xfstest regressions with this patch. David Howells and Matthew Wilcox have been helping me work with this code. One of the regressions has gone away with the most recent version of their code that I'm using. I hope this patch can be pulled even though there are still a few regressions, and that we can try to get them resolved during the RC period.
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEIGSFVdO6eop9nER2z0QOqevODb4FAmCPCUsACgkQz0QOqevO Db77DQ/7B8V7RPlQ8C6HJlSuCED67W9isCG5CdzGobVafBrirbUusanQJRhjrIZO Voy0NYsR/rsM3K1tNk9AE7rlbT4UQibeUXwFVcVjBvtyXBiTgjbROc2AP4pjxAWu erH2McMEbrYjgrevwR/PKxyD8wS6vTX2InnI4yvlkbfEz04u/KkTSu0oN4UCU/8u 8/drWDTIgZz6wffb1RpMFsCP77tfVWIWlRlH39u9OTe4fhPMug8jN+uOBrfyYxdp snJWznyeSYCQ4q/KkPkjfSUTDmx3+E1WeSHMNviHfwENdbcUAojk2O9wepBwJhQn r0DFU2yM+132oRkWO1DF7If1FRfvcmHjE4bmlLBSg+xgKOKpdMCs7Nf+s1Sji+w/ 8xTAPWzdqBeW6z4nIncvZPtjtes3979mJ/Jm/f4GLonAQB6yPJcIzA8gl5EEgXI3 20pAt2JNCgCHVhHQso5fkLINlpND/cwlbOEOjyrNXIoJJngGDRo9FQ/osGBaLv5i n3XWC41lYnX9nqJ2FuVLBuZ+Jv1k5XSQualpyGGVTFaYp/jZVbjUOgJk7QPNsWl7 9cUZAMVdDW6y7z1aZ2bu5y7VFIkPe4nfZNqrgXX+YySq0uOTrQBegkQRp1pu3t8m P3P9lVqcrn/kw+FASZborq921Njw+YDHvZuYfrnbF7J0sUL0fu4= =09Vm -----END PGP SIGNATURE----- Merge tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux Pull orangefs updates from Mike Marshall: "orangefs: implement orangefs_readahead mm/readahead.c/read_pages was quite a bit different back when I put my open-coded readahead logic into orangefs_readpage. That logic seemed to work as designed back then, it is a trainwreck now. This implements orangefs_readahead using the new xarray and readahead_expand features and removes all my open-coded readahead logic. This results in an extreme read performance improvement, these sample numbers are from my test VM: Here's an example of what's upstream in 5.11.8-200.fc33.x86_64: 30+0 records in 30+0 records out 125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s And here's this version of orangefs_readahead on top of 5.12.0-rc4: 30+0 records in 30+0 records out 125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s There are four xfstest regressions with this patch. 
David Howells and Matthew Wilcox have been helping me work with this code" * tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux: orangefs: leave files in the page cache for a few micro seconds at least Orangef: implement orangefs_readahead.
This commit is contained in:
commit
9ccce092fc
@ -248,21 +248,7 @@ populate_shared_memory:
|
||||
* or it can be pointers to struct pages
|
||||
*/
|
||||
|
||||
/*
|
||||
* When reading, readahead_size will only be zero when
|
||||
* we're doing O_DIRECT, otherwise we got here from
|
||||
* orangefs_readpage.
|
||||
*
|
||||
* If we got here from orangefs_readpage we want to
|
||||
* copy either a page or the whole file into the io
|
||||
* vector, whichever is smaller.
|
||||
*/
|
||||
if (readahead_size)
|
||||
copy_amount =
|
||||
min(new_op->downcall.resp.io.amt_complete,
|
||||
(__s64)PAGE_SIZE);
|
||||
else
|
||||
copy_amount = new_op->downcall.resp.io.amt_complete;
|
||||
copy_amount = new_op->downcall.resp.io.amt_complete;
|
||||
|
||||
ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
|
||||
copy_amount);
|
||||
@ -283,19 +269,11 @@ populate_shared_memory:
|
||||
|
||||
out:
|
||||
if (buffer_index >= 0) {
|
||||
if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
|
||||
/* readpage */
|
||||
*index_return = buffer_index;
|
||||
gossip_debug(GOSSIP_FILE_DEBUG,
|
||||
"%s: hold on to buffer_index :%d:\n",
|
||||
__func__, buffer_index);
|
||||
} else {
|
||||
/* O_DIRECT */
|
||||
orangefs_bufmap_put(buffer_index);
|
||||
gossip_debug(GOSSIP_FILE_DEBUG,
|
||||
"%s(%pU): PUT buffer_index %d\n",
|
||||
__func__, handle, buffer_index);
|
||||
}
|
||||
orangefs_bufmap_put(buffer_index);
|
||||
gossip_debug(GOSSIP_FILE_DEBUG,
|
||||
"%s(%pU): PUT buffer_index %d\n",
|
||||
__func__, handle, buffer_index);
|
||||
buffer_index = -1;
|
||||
}
|
||||
op_release(new_op);
|
||||
return ret;
|
||||
|
@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping,
|
||||
|
||||
static int orangefs_launder_page(struct page *);
|
||||
|
||||
static void orangefs_readahead(struct readahead_control *rac)
|
||||
{
|
||||
loff_t offset;
|
||||
struct iov_iter iter;
|
||||
struct file *file = rac->file;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct xarray *i_pages;
|
||||
struct page *page;
|
||||
loff_t new_start = readahead_pos(rac);
|
||||
int ret;
|
||||
size_t new_len = 0;
|
||||
|
||||
loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
|
||||
loff_t pages_remaining = bytes_remaining / PAGE_SIZE;
|
||||
|
||||
if (pages_remaining >= 1024)
|
||||
new_len = 4194304;
|
||||
else if (pages_remaining > readahead_count(rac))
|
||||
new_len = bytes_remaining;
|
||||
|
||||
if (new_len)
|
||||
readahead_expand(rac, new_start, new_len);
|
||||
|
||||
offset = readahead_pos(rac);
|
||||
i_pages = &file->f_mapping->i_pages;
|
||||
|
||||
iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
|
||||
|
||||
/* read in the pages. */
|
||||
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
|
||||
&offset, &iter, readahead_length(rac),
|
||||
inode->i_size, NULL, NULL, file)) < 0)
|
||||
gossip_debug(GOSSIP_FILE_DEBUG,
|
||||
"%s: wait_for_direct_io failed. \n", __func__);
|
||||
else
|
||||
ret = 0;
|
||||
|
||||
/* clean up. */
|
||||
while ((page = readahead_page(rac))) {
|
||||
page_endio(page, false, ret);
|
||||
put_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
static int orangefs_readpage(struct file *file, struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page)
|
||||
struct bio_vec bv;
|
||||
ssize_t ret;
|
||||
loff_t off; /* offset into this page */
|
||||
pgoff_t index; /* which page */
|
||||
struct page *next_page;
|
||||
char *kaddr;
|
||||
loff_t read_size;
|
||||
int buffer_index = -1; /* orangefs shared memory slot */
|
||||
int slot_index; /* index into slot */
|
||||
int remaining;
|
||||
|
||||
/*
|
||||
* Get up to this many bytes from Orangefs at a time and try
|
||||
* to fill them into the page cache at once. Tests with dd made
|
||||
* this seem like a reasonable static number, if there was
|
||||
* interest perhaps this number could be made setable through
|
||||
* sysfs...
|
||||
*/
|
||||
read_size = 524288;
|
||||
|
||||
if (PageDirty(page))
|
||||
orangefs_launder_page(page);
|
||||
|
||||
off = page_offset(page);
|
||||
index = off >> PAGE_SHIFT;
|
||||
bv.bv_page = page;
|
||||
bv.bv_len = PAGE_SIZE;
|
||||
bv.bv_offset = 0;
|
||||
iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
|
||||
|
||||
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
|
||||
read_size, inode->i_size, NULL, &buffer_index, file);
|
||||
remaining = ret;
|
||||
PAGE_SIZE, inode->i_size, NULL, NULL, file);
|
||||
/* this will only zero remaining unread portions of the page data */
|
||||
iov_iter_zero(~0U, &iter);
|
||||
/* takes care of potential aliasing */
|
||||
flush_dcache_page(page);
|
||||
if (ret < 0) {
|
||||
SetPageError(page);
|
||||
unlock_page(page);
|
||||
goto out;
|
||||
} else {
|
||||
SetPageUptodate(page);
|
||||
if (PageError(page))
|
||||
@ -298,60 +322,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
|
||||
}
|
||||
/* unlock the page after the ->readpage() routine completes */
|
||||
unlock_page(page);
|
||||
|
||||
if (remaining > PAGE_SIZE) {
|
||||
slot_index = 0;
|
||||
while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
|
||||
remaining -= PAGE_SIZE;
|
||||
/*
|
||||
* It is an optimization to try and fill more than one
|
||||
* page... by now we've already gotten the single
|
||||
* page we were after, if stuff doesn't seem to
|
||||
* be going our way at this point just return
|
||||
* and hope for the best.
|
||||
*
|
||||
* If we look for pages and they're already there is
|
||||
* one reason to give up, and if they're not there
|
||||
* and we can't create them is another reason.
|
||||
*/
|
||||
|
||||
index++;
|
||||
slot_index++;
|
||||
next_page = find_get_page(inode->i_mapping, index);
|
||||
if (next_page) {
|
||||
gossip_debug(GOSSIP_FILE_DEBUG,
|
||||
"%s: found next page, quitting\n",
|
||||
__func__);
|
||||
put_page(next_page);
|
||||
goto out;
|
||||
}
|
||||
next_page = find_or_create_page(inode->i_mapping,
|
||||
index,
|
||||
GFP_KERNEL);
|
||||
/*
|
||||
* I've never hit this, leave it as a printk for
|
||||
* now so it will be obvious.
|
||||
*/
|
||||
if (!next_page) {
|
||||
printk("%s: can't create next page, quitting\n",
|
||||
__func__);
|
||||
goto out;
|
||||
}
|
||||
kaddr = kmap_atomic(next_page);
|
||||
orangefs_bufmap_page_fill(kaddr,
|
||||
buffer_index,
|
||||
slot_index);
|
||||
kunmap_atomic(kaddr);
|
||||
SetPageUptodate(next_page);
|
||||
unlock_page(next_page);
|
||||
put_page(next_page);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (buffer_index != -1)
|
||||
orangefs_bufmap_put(buffer_index);
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int orangefs_write_begin(struct file *file,
|
||||
@ -660,6 +631,7 @@ out:
|
||||
/** ORANGEFS2 implementation of address space operations */
|
||||
static const struct address_space_operations orangefs_address_operations = {
|
||||
.writepage = orangefs_writepage,
|
||||
.readahead = orangefs_readahead,
|
||||
.readpage = orangefs_readpage,
|
||||
.writepages = orangefs_writepages,
|
||||
.set_page_dirty = __set_page_dirty_nobuffers,
|
||||
|
@ -31,7 +31,7 @@ static ulong module_parm_debug_mask;
|
||||
__u64 orangefs_gossip_debug_mask;
|
||||
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
|
||||
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
|
||||
int orangefs_cache_timeout_msecs = 50;
|
||||
int orangefs_cache_timeout_msecs = 500;
|
||||
int orangefs_dcache_timeout_msecs = 50;
|
||||
int orangefs_getattr_timeout_msecs = 50;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user