linux/drivers/nvdimm/pmem.c

580 lines
15 KiB
C
Raw Normal View History

/*
* Persistent Memory Driver
*
* Copyright (c) 2014-2015, Intel Corporation.
* Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
* Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-28 20:23:37 +03:00
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations The pmem driver has a need to transfer data with a persistent memory destination and be able to rely on the fact that the destination writes are not cached. It is sufficient for the writes to be flushed to a cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect userspace to call fsync() to ensure data-writes have reached a power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or REQ_FLUSH to the pmem driver which will turn around and fence previous writes with an "sfence". Implement a __copy_from_user_inatomic_flushcache, memcpy_page_flushcache, and memcpy_flushcache, that guarantee that the destination buffer is not dirty in the cpu cache on completion. The new copy_from_iter_flushcache and sub-routines will be used to replace the "pmem api" (include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability of copy_from_iter_flushcache() and memcpy_flushcache() are gated by the CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE config symbol, and fallback to copy_from_iter_nocache() and plain memcpy() otherwise. This is meant to satisfy the concern from Linus that if a driver wants to do something beyond the normal nocache semantics it should be something private to that driver [1], and Al's concern that anything uaccess related belongs with the rest of the uaccess code [2]. The first consumer of this interface is a new 'copy_from_iter' dax operation so that pmem can inject cache maintenance operations without imposing this overhead on other dax-capable drivers. [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html Cc: <x86@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Toshi Kani <toshi.kani@hpe.com> Cc: "H. 
Peter Anvin" <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Matthew Wilcox <mawilcox@microsoft.com> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2017-05-29 22:22:50 +03:00
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/nd.h>
#include <linux/backing-dev.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"
#include "nd-core.h"
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 05:44:50 +03:00
/*
 * to_dev - return the struct device associated with a pmem instance
 * @pmem: pmem device to look up
 *
 * nvdimm bus services take a 'dev' argument. The device is recorded in
 * bb.dev at init, so that field is what gets handed back here.
 */
static struct device *to_dev(struct pmem_device *pmem)
{
	struct device *dev = pmem->bb.dev;

	return dev;
}
/*
 * to_region - return the nd_region that is the parent of the pmem device
 * @pmem: pmem device to look up
 *
 * The region is obtained by converting the parent of the device returned
 * by to_dev() with to_nd_region().
 */
static struct nd_region *to_region(struct pmem_device *pmem)
{
	struct device *parent = to_dev(pmem)->parent;

	return to_nd_region(parent);
}
/*
 * pmem_clear_poison - clear poison in a byte range and update badblocks
 * @pmem: pmem device that owns the range
 * @offset: byte offset into the pmem mapping (data_offset included)
 * @len: length of the range in bytes
 *
 * Asks nvdimm_clear_poison() to clear @len bytes, removes every fully
 * cleared 512-byte sector from the badblocks list (notifying bb_state when
 * it is set), and finally invalidates the range with
 * arch_invalidate_pmem() whether or not the clear succeeded.
 *
 * Return: BLK_STS_OK when the whole range was cleared, BLK_STS_IOERR when
 * the clear failed or covered fewer than @len bytes.
 */
static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
		phys_addr_t offset, unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;
	blk_status_t rc = BLK_STS_OK;

	/* badblocks are tracked relative to the start of the data area */
	sector = (offset - pmem->data_offset) / 512;

	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
	/*
	 * 'cleared' is signed while 'len' is unsigned. Where long and
	 * unsigned int have the same width a plain 'cleared < len' converts
	 * a negative result to a huge unsigned value and the failure would
	 * be reported as success, so test the sign explicitly first.
	 */
	if (cleared < 0 || (unsigned long)cleared < len)
		rc = BLK_STS_IOERR;

	/* only whole sectors can be dropped from the badblocks list */
	if (cleared > 0 && cleared / 512) {
		cleared /= 512;
		dev_dbg(dev, "%#llx clear %ld sector%s\n",
				(unsigned long long) sector, cleared,
				cleared > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared);
		if (pmem->bb_state)
			sysfs_notify_dirent(pmem->bb_state);
	}

	arch_invalidate_pmem(pmem->virt_addr + offset, len);

	return rc;
}
/*
 * write_pmem - copy data from one or more pages into persistent memory
 * @pmem_addr: destination address inside the pmem mapping
 * @page: first source page; subsequent pages are reached with page++
 * @off: byte offset of the data within the first page
 *       (assumed to be smaller than PAGE_SIZE - TODO confirm with callers)
 * @len: total number of bytes to copy
 *
 * Every source page is mapped with kmap_atomic() only for the duration of
 * its own copy, and the data is stored with memcpy_flushcache().
 */
static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	unsigned int chunk;
	void *mem;

	while (len) {
		mem = kmap_atomic(page);
		/*
		 * Only PAGE_SIZE - off bytes of this page lie at or after
		 * 'off'. Copying a full PAGE_SIZE from mem + off would read
		 * beyond the end of the mapped page.
		 */
		chunk = min_t(unsigned int, len, PAGE_SIZE - off);
		memcpy_flushcache(pmem_addr, mem + off, chunk);
		kunmap_atomic(mem);
		len -= chunk;
		off = 0;
		page++;
		/* advance by what was written so the destination stays contiguous */
		pmem_addr += chunk;
	}
}
/*
 * read_pmem - copy data from persistent memory into one or more pages
 * @page: first destination page; subsequent pages are reached with page++
 * @off: byte offset of the destination within the first page
 *       (assumed to be smaller than PAGE_SIZE - TODO confirm with callers)
 * @pmem_addr: source address inside the pmem mapping
 * @len: total number of bytes to copy
 *
 * Every destination page is mapped with kmap_atomic() only for the duration
 * of its own copy. The copy uses memcpy_mcsafe(); a non-zero return from it
 * aborts the transfer.
 *
 * Return: BLK_STS_OK when all @len bytes were copied, BLK_STS_IOERR as soon
 * as memcpy_mcsafe() reports a non-zero remainder.
 */
static blk_status_t read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	unsigned int chunk;
	unsigned long rem;
	void *mem;

	while (len) {
		mem = kmap_atomic(page);
		/*
		 * Only PAGE_SIZE - off bytes of this page lie at or after
		 * 'off'. Copying a full PAGE_SIZE to mem + off would write
		 * beyond the end of the mapped page.
		 */
		chunk = min_t(unsigned int, len, PAGE_SIZE - off);
		rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
		kunmap_atomic(mem);
		if (rem)
			return BLK_STS_IOERR;
		len -= chunk;
		off = 0;
		page++;
		/* advance by what was read so the source stays contiguous */
		pmem_addr += chunk;
	}

	return BLK_STS_OK;
}
/*
 * pmem_do_bvec - transfer one segment between a page and persistent memory
 * @pmem: target pmem device
 * @page: page holding (write) or receiving (read) the data
 * @len: number of bytes to transfer
 * @off: byte offset of the data within @page
 * @is_write: true to copy @page into pmem, false to copy pmem into @page
 * @sector: 512-byte sector, relative to the data area, where the I/O starts
 *
 * Return: BLK_STS_OK on success. A read of a range listed in badblocks
 * fails with BLK_STS_IOERR without touching the media; otherwise the
 * status of read_pmem() is returned. A write to a range listed in
 * badblocks returns the status of pmem_clear_poison().
 */
static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
		unsigned int len, unsigned int off, bool is_write,
		sector_t sector)
{
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;
	bool poisoned = is_bad_pmem(&pmem->bb, sector, len);
	blk_status_t status;

	if (is_write) {
		/*
		 * The data is written both before and after poison is
		 * cleared.
		 *
		 * Writing first covers hardware where the clear operation
		 * merely marks the range valid and keeps the most recently
		 * written data: software may then rely on an interrupted
		 * write yielding either the new good data or an error.
		 *
		 * Writing again afterwards covers hardware where
		 * pmem_clear_poison() leaves the contents indeterminate.
		 */
		flush_dcache_page(page);
		write_pmem(pmem_addr, page, off, len);
		if (likely(!poisoned))
			return BLK_STS_OK;

		status = pmem_clear_poison(pmem, pmem_off, len);
		write_pmem(pmem_addr, page, off, len);
		return status;
	}

	/* never read from a range that is known to be bad */
	if (unlikely(poisoned))
		return BLK_STS_IOERR;

	status = read_pmem(page, off, pmem_addr, len);
	flush_dcache_page(page);
	return status;
}
/*
 * pmem_make_request - process a bio synchronously against persistent memory
 * @q: request queue; its queuedata points at the pmem device
 * @bio: bio to process
 *
 * A REQ_PREFLUSH bio triggers nvdimm_flush() before any data is moved and a
 * REQ_FUA bio triggers it after the last segment. Segments are handed to
 * pmem_do_bvec() one at a time; the first failure is stored in
 * bio->bi_status and stops the walk. The bio is always completed with
 * bio_endio() before returning.
 *
 * Return: always BLK_QC_T_NONE.
 */
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);
	bool is_write = op_is_write(bio_op(bio));
	struct bvec_iter iter;
	struct bio_vec bvec;
	unsigned long start;
	blk_status_t status;
	bool do_acct;

	if (bio->bi_opf & REQ_PREFLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);

	bio_for_each_segment(bvec, bio, iter) {
		status = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, is_write, iter.bi_sector);
		if (!status)
			continue;

		/* record the first error and give up on the rest */
		bio->bi_status = status;
		break;
	}

	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);

	return BLK_QC_T_NONE;
}
/*
 * pmem_rw_page - ->rw_page handler: read or write one (possibly huge) page
 * @bdev: block device; its queue's queuedata points at the pmem device
 * @sector: starting sector of the transfer
 * @page: page to transfer; hpage_nr_pages() * PAGE_SIZE bytes are moved
 * @is_write: true for a write, false for a read
 *
 * Return: the errno that blk_status_to_errno() maps the pmem_do_bvec()
 * status to.
 */
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	unsigned int nbytes = hpage_nr_pages(page) * PAGE_SIZE;
	blk_status_t status;

	status = pmem_do_bvec(pmem, page, nbytes, 0, is_write, sector);

	/*
	 * ->rw_page is subtle: the core retries after any error, so
	 * page_endio() may only be called when the transfer succeeded.
	 * Completing the page on failure as well would complete it twice
	 * and crash.
	 */
	if (!status)
		page_endio(page, is_write, 0);

	return blk_status_to_errno(status);
}
/*
 * __pmem_direct_access - translate a page offset into a kernel address / pfn
 * @pmem: pmem device to access
 * @pgoff: page offset into the data area
 * @nr_pages: number of pages the caller wants to access
 * @kaddr: out: kernel virtual address of the first page
 * @pfn: out: pfn_t of the first page, tagged with pmem->pfn_flags
 *
 * Return: -EIO when the requested range intersects badblocks. Otherwise
 * the number of pages usable from @pgoff: exactly @nr_pages while any
 * badblocks are recorded on the device, else everything up to the end of
 * the device minus pfn_pad.
 *
 * This definition is __weak; see the "strong" declaration in
 * tools/testing/nvdimm/pmem-dax.c
 */
__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
	bool range_is_bad = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
					PFN_PHYS(nr_pages));

	if (unlikely(range_is_bad))
		return -EIO;

	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/* with no badblocks at all the rest of the device is known good */
	if (likely(!pmem->bb.count))
		return PHYS_PFN(pmem->size - pmem->pfn_pad - offset);

	/* badblocks exist somewhere: vouch only for the requested range */
	return nr_pages;
}
/*
 * Block device operations table of this driver: whole-page I/O goes through
 * pmem_rw_page() and revalidation through nvdimm_revalidate_disk().
 */
static const struct block_device_operations pmem_fops = {
	.revalidate_disk	= nvdimm_revalidate_disk,
	.rw_page		= pmem_rw_page,
	.owner			= THIS_MODULE,
};
/*
 * pmem_dax_direct_access - dax ->direct_access entry point
 * @dax_dev: dax device whose private data is the pmem device
 * @pgoff: page offset into the data area
 * @nr_pages: number of pages requested
 * @kaddr: out: kernel virtual address
 * @pfn: out: page frame number
 *
 * Thin adapter that fetches the pmem device with dax_get_private() and
 * forwards everything to __pmem_direct_access(), returning its result.
 */
static long pmem_dax_direct_access(struct dax_device *dax_dev,
		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
	return __pmem_direct_access(dax_get_private(dax_dev), pgoff,
			nr_pages, kaddr, pfn);
}
x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations The pmem driver has a need to transfer data with a persistent memory destination and be able to rely on the fact that the destination writes are not cached. It is sufficient for the writes to be flushed to a cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect userspace to call fsync() to ensure data-writes have reached a power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or REQ_FLUSH to the pmem driver which will turn around and fence previous writes with an "sfence". Implement a __copy_from_user_inatomic_flushcache, memcpy_page_flushcache, and memcpy_flushcache, that guarantee that the destination buffer is not dirty in the cpu cache on completion. The new copy_from_iter_flushcache and sub-routines will be used to replace the "pmem api" (include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability of copy_from_iter_flushcache() and memcpy_flushcache() are gated by the CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE config symbol, and fallback to copy_from_iter_nocache() and plain memcpy() otherwise. This is meant to satisfy the concern from Linus that if a driver wants to do something beyond the normal nocache semantics it should be something private to that driver [1], and Al's concern that anything uaccess related belongs with the rest of the uaccess code [2]. The first consumer of this interface is a new 'copy_from_iter' dax operation so that pmem can inject cache maintenance operations without imposing this overhead on other dax-capable drivers. [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html Cc: <x86@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Toshi Kani <toshi.kani@hpe.com> Cc: "H. 
Peter Anvin" <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Matthew Wilcox <mawilcox@microsoft.com> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2017-05-29 22:22:50 +03:00
/*
 * pmem_copy_from_iter - dax ->copy_from_iter entry point
 * @dax_dev: dax device (not used here)
 * @pgoff: page offset (not used here)
 * @addr: destination address
 * @bytes: number of bytes to copy
 * @i: source iterator
 *
 * Return: whatever copy_from_iter_flushcache() returns for the copy.
 */
static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	size_t copied = copy_from_iter_flushcache(addr, bytes, i);

	return copied;
}
/*
 * pmem_copy_to_iter - dax ->copy_to_iter entry point
 * @dax_dev: dax device (not used here)
 * @pgoff: page offset (not used here)
 * @addr: source address
 * @bytes: number of bytes to copy
 * @i: destination iterator
 *
 * Return: whatever copy_to_iter_mcsafe() returns for the copy.
 */
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	size_t copied = copy_to_iter_mcsafe(addr, bytes, i);

	return copied;
}
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
x86, uaccess: introduce copy_from_iter_flushcache for pmem / cache-bypass operations The pmem driver has a need to transfer data with a persistent memory destination and be able to rely on the fact that the destination writes are not cached. It is sufficient for the writes to be flushed to a cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect userspace to call fsync() to ensure data-writes have reached a power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or REQ_FLUSH to the pmem driver which will turn around and fence previous writes with an "sfence". Implement a __copy_from_user_inatomic_flushcache, memcpy_page_flushcache, and memcpy_flushcache, that guarantee that the destination buffer is not dirty in the cpu cache on completion. The new copy_from_iter_flushcache and sub-routines will be used to replace the "pmem api" (include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability of copy_from_iter_flushcache() and memcpy_flushcache() are gated by the CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE config symbol, and fallback to copy_from_iter_nocache() and plain memcpy() otherwise. This is meant to satisfy the concern from Linus that if a driver wants to do something beyond the normal nocache semantics it should be something private to that driver [1], and Al's concern that anything uaccess related belongs with the rest of the uaccess code [2]. The first consumer of this interface is a new 'copy_from_iter' dax operation so that pmem can inject cache maintenance operations without imposing this overhead on other dax-capable drivers. [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html Cc: <x86@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Toshi Kani <toshi.kani@hpe.com> Cc: "H. 
Peter Anvin" <hpa@zytor.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Matthew Wilcox <mawilcox@microsoft.com> Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2017-05-29 22:22:50 +03:00
.copy_from_iter = pmem_copy_from_iter,
.copy_to_iter = pmem_copy_to_iter,
};
/* NULL-terminated list of attribute groups; only the dax group is listed. */
static const struct attribute_group *pmem_attribute_groups[] = {
	[0] = &dax_attribute_group,
	[1] = NULL,
};
/*
 * pmem_release_queue - release callback that cleans up a request queue
 * @q: the struct request_queue, passed as an opaque pointer
 */
static void pmem_release_queue(void *q)
{
	struct request_queue *queue = q;

	blk_cleanup_queue(queue);
}
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-28 20:23:37 +03:00
/*
 * pmem_freeze_queue - release callback that starts freezing a request queue
 * @q: the struct request_queue, passed as an opaque pointer
 *
 * Only calls blk_freeze_queue_start() on the queue.
 */
static void pmem_freeze_queue(void *q)
{
	struct request_queue *queue = q;

	blk_freeze_queue_start(queue);
}
/*
 * pmem_release_disk - release callback that tears down the dax device and disk
 * @__pmem: the struct pmem_device, passed as an opaque pointer
 *
 * The dax device is killed and its reference dropped first; the gendisk is
 * then deleted and its reference dropped.
 */
static void pmem_release_disk(void *__pmem)
{
	struct pmem_device *pmem = __pmem;
	struct dax_device *dax_dev = pmem->dax_dev;
	struct gendisk *disk = pmem->disk;

	kill_dax(dax_dev);
	put_dax(dax_dev);

	del_gendisk(disk);
	put_disk(disk);
}
mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS In preparation for fixing dax-dma-vs-unmap issues, filesystems need to be able to rely on the fact that they will get wakeups on dev_pagemap page-idle events. Introduce MEMORY_DEVICE_FS_DAX and generic_dax_page_free() as common indicator / infrastructure for dax filesystems to require. With this change there are no users of the MEMORY_DEVICE_HOST designation, so remove it. The HMM sub-system extended dev_pagemap to arrange a callback when a dev_pagemap managed page is freed. Since a dev_pagemap page is free / idle when its reference count is 1 it requires an additional branch to check the page-type at put_page() time. Given put_page() is a hot-path we do not want to incur that check if HMM is not in use, so a static branch is used to avoid that overhead when not necessary. Now, the FS_DAX implementation wants to reuse this mechanism for receiving dev_pagemap ->page_free() callbacks. Rework the HMM-specific static-key into a generic mechanism that either HMM or FS_DAX code paths can enable. For ARCH=um builds, and any other arch that lacks ZONE_DEVICE support, care must be taken to compile out the DEV_PAGEMAP_OPS infrastructure. However, we still need to support FS_DAX in the FS_DAX_LIMITED case implemented by the s390/dcssblk driver. Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Reported-by: kbuild test robot <lkp@intel.com> Reported-by: Thomas Meyer <thomas@m3y3r.de> Reported-by: Dave Jiang <dave.jiang@intel.com> Cc: "Jérôme Glisse" <jglisse@redhat.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2018-05-16 21:46:08 +03:00
/*
 * pmem_release_pgmap_ops - release callback pairing with setup_pagemap_fsdax()
 * @__pgmap: callback argument; not used by this function
 *
 * Drops the reference that setup_pagemap_fsdax() took with
 * dev_pagemap_get_ops().
 */
static void pmem_release_pgmap_ops(void *__pgmap)
{
	/* balances the dev_pagemap_get_ops() call made at setup time */
	dev_pagemap_put_ops();
}
/*
 * fsdax_pagefree - page_free callback installed by setup_pagemap_fsdax()
 * @page: page the callback is invoked for
 * @data: callback cookie; not used by this function
 *
 * Wakes anyone waiting on the page's _refcount via wake_up_var().
 */
static void fsdax_pagefree(struct page *page, void *data)
{
	void *wait_key = &page->_refcount;

	wake_up_var(wait_key);
}
/*
 * setup_pagemap_fsdax - prepare a dev_pagemap for filesystem-DAX use
 * @dev: device the release action is attached to
 * @pgmap: pagemap to configure
 *
 * Takes a dev_pagemap ops reference and registers
 * pmem_release_pgmap_ops() against @dev to drop it again, then marks
 * @pgmap as MEMORY_DEVICE_FS_DAX with fsdax_pagefree() as its page_free
 * callback.
 *
 * Return: 0 on success, -ENOMEM when the release action cannot be
 * registered (in which case @pgmap is left unmodified).
 */
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
{
	int err;

	dev_pagemap_get_ops();

	err = devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap);
	if (err != 0)
		return -ENOMEM;

	pgmap->page_free = fsdax_pagefree;
	pgmap->type = MEMORY_DEVICE_FS_DAX;

	return 0;
}
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
libnvdimm: introduce nvdimm_flush() and nvdimm_has_flush() nvdimm_flush() is a replacement for the x86 'pcommit' instruction. It is an optional write flushing mechanism that an nvdimm bus can provide for the pmem driver to consume. In the case of the NFIT nvdimm-bus-provider nvdimm_flush() is implemented as a series of flush-hint-address [1] writes to each dimm in the interleave set (region) that backs the namespace. The nvdimm_has_flush() routine relies on platform firmware to describe the flushing capabilities of a platform. It uses the heuristic of whether an nvdimm bus provider provides flush address data to return a ternary result: 1: flush addresses defined 0: dimm topology described without flush addresses (assume ADR) -errno: no topology information, unable to determine flush mechanism The pmem driver is expected to take the following actions on this ternary result: 1: nvdimm_flush() in response to REQ_FUA / REQ_FLUSH and shutdown 0: do not set, WC or FUA on the queue, take no further action -errno: warn and then operate as if nvdimm_has_flush() returned '0' The caveat of this heuristic is that it can not distinguish the "dimm does not have flush address" case from the "platform firmware is broken and failed to describe a flush address". Given we are already explicitly trusting the NFIT there's not much more we can do beyond blacklisting broken firmwares if they are ever encountered. Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2016-07-08 05:44:50 +03:00
struct nd_region *nd_region = to_nd_region(dev->parent);
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
struct resource bb_res;
struct nd_pfn *nd_pfn = NULL;
struct dax_device *dax_dev;
struct nd_pfn_sb *pfn_sb;
struct pmem_device *pmem;
struct request_queue *q;
struct device *gendev;
struct gendisk *disk;
void *addr;
int rc;
pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
if (!pmem)
return -ENOMEM;
/* while nsio_rw_bytes is active, parse a pfn info block if present */
if (is_nd_pfn(dev)) {
nd_pfn = to_nd_pfn(dev);
rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap);
if (rc)
return rc;
}
/* we're attaching a block device, disable raw namespace access */
devm_nsio_disable(dev, nsio);
dev_set_drvdata(dev, pmem);
pmem->phys_addr = res->start;
pmem->size = resource_size(res);
fua = nvdimm_has_flush(nd_region);
if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) {
dev_warn(dev, "unable to guarantee persistence of writes\n");
fua = 0;
}
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
dev_warn(dev, "could not reserve region %pR\n", res);
return -EBUSY;
}
q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
if (!q)
return -ENOMEM;
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-28 20:23:37 +03:00
if (devm_add_action_or_reset(dev, pmem_release_queue, q))
return -ENOMEM;
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
if (is_nd_pfn(dev)) {
mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS In preparation for fixing dax-dma-vs-unmap issues, filesystems need to be able to rely on the fact that they will get wakeups on dev_pagemap page-idle events. Introduce MEMORY_DEVICE_FS_DAX and generic_dax_page_free() as common indicator / infrastructure for dax filesytems to require. With this change there are no users of the MEMORY_DEVICE_HOST designation, so remove it. The HMM sub-system extended dev_pagemap to arrange a callback when a dev_pagemap managed page is freed. Since a dev_pagemap page is free / idle when its reference count is 1 it requires an additional branch to check the page-type at put_page() time. Given put_page() is a hot-path we do not want to incur that check if HMM is not in use, so a static branch is used to avoid that overhead when not necessary. Now, the FS_DAX implementation wants to reuse this mechanism for receiving dev_pagemap ->page_free() callbacks. Rework the HMM-specific static-key into a generic mechanism that either HMM or FS_DAX code paths can enable. For ARCH=um builds, and any other arch that lacks ZONE_DEVICE support, care must be taken to compile out the DEV_PAGEMAP_OPS infrastructure. However, we still need to support FS_DAX in the FS_DAX_LIMITED case implemented by the s390/dcssblk driver. Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Reported-by: kbuild test robot <lkp@intel.com> Reported-by: Thomas Meyer <thomas@m3y3r.de> Reported-by: Dave Jiang <dave.jiang@intel.com> Cc: "Jérôme Glisse" <jglisse@redhat.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2018-05-16 21:46:08 +03:00
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
pmem->pfn_pad = resource_size(res) -
resource_size(&pmem->pgmap.res);
pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
bb_res.start += pmem->data_offset;
} else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false;
mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS In preparation for fixing dax-dma-vs-unmap issues, filesystems need to be able to rely on the fact that they will get wakeups on dev_pagemap page-idle events. Introduce MEMORY_DEVICE_FS_DAX and generic_dax_page_free() as common indicator / infrastructure for dax filesytems to require. With this change there are no users of the MEMORY_DEVICE_HOST designation, so remove it. The HMM sub-system extended dev_pagemap to arrange a callback when a dev_pagemap managed page is freed. Since a dev_pagemap page is free / idle when its reference count is 1 it requires an additional branch to check the page-type at put_page() time. Given put_page() is a hot-path we do not want to incur that check if HMM is not in use, so a static branch is used to avoid that overhead when not necessary. Now, the FS_DAX implementation wants to reuse this mechanism for receiving dev_pagemap ->page_free() callbacks. Rework the HMM-specific static-key into a generic mechanism that either HMM or FS_DAX code paths can enable. For ARCH=um builds, and any other arch that lacks ZONE_DEVICE support, care must be taken to compile out the DEV_PAGEMAP_OPS infrastructure. However, we still need to support FS_DAX in the FS_DAX_LIMITED case implemented by the s390/dcssblk driver. Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Reported-by: kbuild test robot <lkp@intel.com> Reported-by: Thomas Meyer <thomas@m3y3r.de> Reported-by: Dave Jiang <dave.jiang@intel.com> Cc: "Jérôme Glisse" <jglisse@redhat.com> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2018-05-16 21:46:08 +03:00
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
} else
addr = devm_memremap(dev, pmem->phys_addr,
pmem->size, ARCH_MEMREMAP_PMEM);
/*
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-28 20:23:37 +03:00
* At release time the queue must be frozen before
* devm_memremap_pages is unwound
*/
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. 
Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com> Reviewed-by: Logan Gunthorpe <logang@deltatee.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@kernel.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Josh Poimboeuf <jpoimboe@redhat.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-28 20:23:37 +03:00
if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
return -ENOMEM;
if (IS_ERR(addr))
return PTR_ERR(addr);
pmem->virt_addr = addr;
blk_queue_write_cache(q, true, fua);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
q->queuedata = pmem;
disk = alloc_disk_node(0, nid);
if (!disk)
return -ENOMEM;
pmem->disk = disk;
disk->fops = &pmem_fops;
disk->queue = q;
disk->flags = GENHD_FL_EXT_DEVT;
disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski <luto@amacapital.net> Cc: Boaz Harrosh <boaz@plexistor.com> Cc: H. 
Peter Anvin <hpa@zytor.com> Cc: Jens Axboe <axboe@fb.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Neil Brown <neilb@suse.de> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg KH <gregkh@linuxfoundation.org> [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma <vishal.l.verma@linux.intel.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2015-06-25 11:20:32 +03:00
nvdimm_namespace_disk_name(ndns, disk->disk_name);
set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
/ 512);
if (devm_init_badblocks(dev, &pmem->bb))
return -ENOMEM;
nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
disk->bb = &pmem->bb;
dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
if (!dax_dev) {
put_disk(disk);
return -ENOMEM;
}
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
gendev = disk_to_dev(disk);
gendev->groups = pmem_attribute_groups;
device_add_disk(dev, disk);
if (devm_add_action_or_reset(dev, pmem_release_disk, pmem))
return -ENOMEM;
revalidate_disk(disk);
pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd,
"badblocks");
if (!pmem->bb_state)
dev_warn(dev, "'badblocks' notification disabled\n");
return 0;
}
/*
 * nd_pmem_probe - driver probe entry point
 * @dev: device being bound to the nd_pmem driver
 *
 * Resolves the backing namespace, enables raw namespace I/O and then picks
 * the personality to attach: btt, pfn-backed disk, or raw pmem disk.  If an
 * info block is discovered on a plain namespace the probe is rejected with
 * -ENXIO so the device can come back as that personality.
 *
 * Return: 0 on success, a negative errno on failure.
 */
static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns = nvdimm_namespace_common_probe(dev);

	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)) != 0)
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (!is_nd_pfn(dev)) {
		/*
		 * if we find a valid info-block we'll come back as that
		 * personality; the probes run in btt, pfn, dax order and
		 * stop at the first hit
		 */
		if (!nd_btt_probe(dev, ndns))
			return -ENXIO;
		if (!nd_pfn_probe(dev, ndns))
			return -ENXIO;
		if (!nd_dax_probe(dev, ndns))
			return -ENXIO;
	}

	/* pfn claim, or no info block at all: attach as a pmem disk */
	return pmem_attach_disk(dev, ndns);
}
/*
 * nd_pmem_remove - driver remove entry point
 * @dev: device being unbound
 *
 * Detaches the btt personality, or drops the 'badblocks' sysfs reference
 * held by a pmem disk, and then flushes the parent region.
 *
 * Return: always 0.
 */
static int nd_pmem_remove(struct device *dev)
{
	struct nd_region *nd_region;

	if (is_nd_btt(dev)) {
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	} else {
		struct pmem_device *pmem = dev_get_drvdata(dev);

		/*
		 * Note, this assumes device_lock() context to not race
		 * nd_pmem_notify()
		 */
		sysfs_put(pmem->bb_state);
		pmem->bb_state = NULL;
	}

	nd_region = to_nd_region(dev->parent);
	nvdimm_flush(nd_region);

	return 0;
}
/*
 * nd_pmem_shutdown - driver shutdown entry point
 * @dev: device being shut down
 *
 * Issues nvdimm_flush() on the region that is the parent of @dev.
 */
static void nd_pmem_shutdown(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);

	nvdimm_flush(nd_region);
}
/*
 * nd_pmem_notify - driver notification entry point
 * @dev:   device receiving the event
 * @event: event type; everything except NVDIMM_REVALIDATE_POISON is ignored
 *
 * Rebuilds the badblocks list for the range the device exposes and, for a
 * pmem disk that holds a 'badblocks' sysfs node, signals that node.
 */
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	resource_size_t head_skip = 0, tail_skip = 0;
	struct kernfs_node *bb_state = NULL;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct nd_region *nd_region;
	struct badblocks *bb;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		/* btt: badblocks live on the namespace, nothing to notify */
		ndns = nd_btt->ndns;
		nd_region = to_nd_region(ndns->dev.parent);
		nsio = to_nd_namespace_io(&ndns->dev);
		bb = &nsio->bb;
	} else {
		struct pmem_device *pmem = dev_get_drvdata(dev);

		nd_region = to_region(pmem);
		bb = &pmem->bb;
		bb_state = pmem->bb_state;

		if (!is_nd_pfn(dev)) {
			ndns = to_ndns(dev);
		} else {
			struct nd_pfn *nd_pfn = to_nd_pfn(dev);
			struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

			/* trim the range by the offsets in the pfn info block */
			ndns = nd_pfn->ndns;
			head_skip = pmem->data_offset +
				__le32_to_cpu(pfn_sb->start_pad);
			tail_skip = __le32_to_cpu(pfn_sb->end_trunc);
		}

		nsio = to_nd_namespace_io(&ndns->dev);
	}

	/* only start and end of the temporary resource are filled in */
	res.start = nsio->res.start + head_skip;
	res.end = nsio->res.end - tail_skip;
	nvdimm_badblocks_populate(nd_region, bb, &res);

	if (bb_state)
		sysfs_notify_dirent(bb_state);
}
/*
 * Module aliases: the literal name "pmem" plus one alias per nvdimm device
 * type listed below (namespace-io and namespace-pmem).
 * NOTE(review): presumably these let the module autoload when such a device
 * appears - confirm against the MODULE_ALIAS_ND_DEVICE definition.
 */
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
/*
 * Driver descriptor for "nd_pmem": wires the probe/remove/shutdown/notify
 * callbacks defined in this file and declares the two namespace types
 * (io and pmem) the driver handles.
 */
static struct nd_device_driver nd_pmem_driver = {
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.shutdown = nd_pmem_shutdown,
	.notify = nd_pmem_notify,
};
/*
 * Module boilerplate for nd_pmem_driver.
 * NOTE(review): module_nd_driver() presumably generates the module
 * init/exit pair that registers and unregisters the driver - confirm
 * against its definition in the libnvdimm headers.
 */
module_nd_driver(nd_pmem_driver);
MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");