115 lines
5.0 KiB
C
115 lines
5.0 KiB
C
|
/*
|
||
|
* Copyright (C) 2016 Oracle. All Rights Reserved.
|
||
|
*
|
||
|
* Author: Darrick J. Wong <darrick.wong@oracle.com>
|
||
|
*
|
||
|
* This program is free software; you can redistribute it and/or
|
||
|
* modify it under the terms of the GNU General Public License
|
||
|
* as published by the Free Software Foundation; either version 2
|
||
|
* of the License, or (at your option) any later version.
|
||
|
*
|
||
|
* This program is distributed in the hope that it would be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
* GNU General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU General Public License
|
||
|
* along with this program; if not, write the Free Software Foundation,
|
||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
|
||
|
*/
|
||
|
#include "xfs.h"
|
||
|
#include "xfs_fs.h"
|
||
|
#include "xfs_shared.h"
|
||
|
#include "xfs_format.h"
|
||
|
#include "xfs_log_format.h"
|
||
|
#include "xfs_trans_resv.h"
|
||
|
#include "xfs_mount.h"
|
||
|
#include "xfs_defer.h"
|
||
|
#include "xfs_da_format.h"
|
||
|
#include "xfs_da_btree.h"
|
||
|
#include "xfs_inode.h"
|
||
|
#include "xfs_trans.h"
|
||
|
#include "xfs_inode_item.h"
|
||
|
#include "xfs_bmap.h"
|
||
|
#include "xfs_bmap_util.h"
|
||
|
#include "xfs_error.h"
|
||
|
#include "xfs_dir2.h"
|
||
|
#include "xfs_dir2_priv.h"
|
||
|
#include "xfs_ioctl.h"
|
||
|
#include "xfs_trace.h"
|
||
|
#include "xfs_log.h"
|
||
|
#include "xfs_icache.h"
|
||
|
#include "xfs_pnfs.h"
|
||
|
#include "xfs_refcount_btree.h"
|
||
|
#include "xfs_refcount.h"
|
||
|
#include "xfs_bmap_btree.h"
|
||
|
#include "xfs_trans_space.h"
|
||
|
#include "xfs_bit.h"
|
||
|
#include "xfs_alloc.h"
|
||
|
#include "xfs_quota_defs.h"
|
||
|
#include "xfs_quota.h"
|
||
|
#include "xfs_btree.h"
|
||
|
#include "xfs_bmap_btree.h"
|
||
|
#include "xfs_reflink.h"
|
||
|
|
||
|
/*
|
||
|
* Copy on Write of Shared Blocks
|
||
|
*
|
||
|
* XFS must preserve "the usual" file semantics even when two files share
|
||
|
* the same physical blocks. This means that a write to one file must not
|
||
|
* alter the blocks in a different file; the way that we'll do that is
|
||
|
* through the use of a copy-on-write mechanism. At a high level, that
|
||
|
* means that when we want to write to a shared block, we allocate a new
|
||
|
* block, write the data to the new block, and if that succeeds we map the
|
||
|
* new block into the file.
|
||
|
*
|
||
|
* XFS provides a "delayed allocation" mechanism that defers the allocation
|
||
|
* of disk blocks to dirty-but-not-yet-mapped file blocks as long as
|
||
|
* possible. This reduces fragmentation by enabling the filesystem to ask
|
||
|
* for bigger chunks less often, which is exactly what we want for CoW.
|
||
|
*
|
||
|
* The delalloc mechanism begins when the kernel wants to make a block
|
||
|
* writable (write_begin or page_mkwrite). If the offset is not mapped, we
|
||
|
* create a delalloc mapping, which is a regular in-core extent, but without
|
||
|
* a real startblock. (For delalloc mappings, the startblock encodes both
|
||
|
* a flag that this is a delalloc mapping, and a worst-case estimate of how
|
||
|
* many blocks might be required to put the mapping into the BMBT.) Delalloc
|
||
|
* mappings are a reservation against the free space in the filesystem;
|
||
|
* adjacent mappings can also be combined into fewer larger mappings.
|
||
|
*
|
||
|
* When dirty pages are being written out (typically in writepage), the
|
||
|
* delalloc reservations are converted into real mappings by allocating
|
||
|
* blocks and replacing the delalloc mapping with real ones. A delalloc
|
||
|
* mapping can be replaced by several real ones if the free space is
|
||
|
* fragmented.
|
||
|
*
|
||
|
* We want to adapt the delalloc mechanism for copy-on-write, since the
|
||
|
* write paths are similar. The first two steps (creating the reservation
|
||
|
* and allocating the blocks) are exactly the same as delalloc except that
|
||
|
* the mappings must be stored in a separate CoW fork because we do not want
|
||
|
* to disturb the mapping in the data fork until we're sure that the write
|
||
|
* succeeded. IO completion in this case is the process of removing the old
|
||
|
* mapping from the data fork and moving the new mapping from the CoW fork to
|
||
|
* the data fork. This will be discussed shortly.
|
||
|
*
|
||
|
* For now, unaligned directio writes will be bounced back to the page cache.
|
||
|
* Block-aligned directio writes will use the same mechanism as buffered
|
||
|
* writes.
|
||
|
*
|
||
|
* CoW remapping must be done after the data block write completes,
|
||
|
* because we don't want to destroy the old data fork map until we're sure
|
||
|
* the new block has been written. Since the new mappings are kept in a
|
||
|
* separate fork, we can simply iterate these mappings to find the ones
|
||
|
* that cover the file blocks that we just CoW'd. For each extent, simply
|
||
|
* unmap the corresponding range in the data fork, map the new range into
|
||
|
* the data fork, and remove the extent from the CoW fork.
|
||
|
*
|
||
|
* Since the remapping operation can be applied to an arbitrary file
|
||
|
* range, we record the need for the remap step as a flag in the ioend
|
||
|
* instead of declaring a new IO type. This is required for direct io
|
||
|
* because we only have one ioend for the whole dio, and we have to be able to
|
||
|
* remember the presence of unwritten blocks and CoW blocks with a single
|
||
|
* ioend structure. Better yet, the more ground we can cover with one
|
||
|
* ioend, the better.
|
||
|
*/
|