2005-04-17 02:20:36 +04:00
/*
2005-11-02 06:58:39 +03:00
* Copyright ( c ) 2000 - 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-17 02:20:36 +04:00
*
2005-11-02 06:58:39 +03:00
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License as
2005-04-17 02:20:36 +04:00
* published by the Free Software Foundation .
*
2005-11-02 06:58:39 +03:00
* This program is distributed in the hope that it would be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
2005-04-17 02:20:36 +04:00
*
2005-11-02 06:58:39 +03:00
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write the Free Software Foundation ,
* Inc . , 51 Franklin St , Fifth Floor , Boston , MA 02110 - 1301 USA
2005-04-17 02:20:36 +04:00
*/
# include "xfs.h"
2005-11-02 06:38:42 +03:00
# include "xfs_fs.h"
2013-10-23 03:36:05 +04:00
# include "xfs_shared.h"
2013-10-23 03:50:10 +04:00
# include "xfs_format.h"
2013-10-23 03:51:50 +04:00
# include "xfs_log_format.h"
2013-10-23 03:50:10 +04:00
# include "xfs_trans_resv.h"
2005-04-17 02:20:36 +04:00
# include "xfs_sb.h"
# include "xfs_mount.h"
2016-08-03 04:15:38 +03:00
# include "xfs_defer.h"
2014-06-06 09:15:59 +04:00
# include "xfs_da_format.h"
# include "xfs_da_btree.h"
2005-11-02 06:38:42 +03:00
# include "xfs_inode.h"
2013-10-23 03:50:10 +04:00
# include "xfs_trans.h"
2005-11-02 06:38:42 +03:00
# include "xfs_inode_item.h"
2005-04-17 02:20:36 +04:00
# include "xfs_error.h"
2013-10-23 03:51:50 +04:00
# include "xfs_btree.h"
# include "xfs_alloc_btree.h"
2005-04-17 02:20:36 +04:00
# include "xfs_alloc.h"
2016-08-03 04:36:08 +03:00
# include "xfs_rmap_btree.h"
2005-04-17 02:20:36 +04:00
# include "xfs_ialloc.h"
# include "xfs_fsops.h"
# include "xfs_itable.h"
# include "xfs_trans_space.h"
# include "xfs_rtalloc.h"
2009-12-15 02:14:59 +03:00
# include "xfs_trace.h"
2013-10-23 03:50:10 +04:00
# include "xfs_log.h"
2013-10-23 03:51:50 +04:00
# include "xfs_filestream.h"
2016-08-03 04:33:42 +03:00
# include "xfs_rmap.h"
2016-10-03 19:11:44 +03:00
# include "xfs_ag_resv.h"
2005-04-17 02:20:36 +04:00
/*
* File system operations
*/
/*
* Fill in a filesystem geometry structure from the superblock held in
* the mount structure ( mp - > m_sb ) .
*
* mp : mount whose m_sb ( and , for new_version > = 3 , m_dir_geo ) is read
* geo : output structure ; it is zeroed first and then populated
* new_version : selects how much of the structure is populated :
* any value - the basic size / count fields and the uuid
* > = 2 - additionally sunit and swidth
* > = 3 - additionally version , flags , logsectsize , rtsectsize
* and dirblocksize
* > = 4 - additionally the LOGV2 flag bit and logsunit
*
* Fields belonging to a level above new_version are left at the zero
* written by the initial memset .
*
* Always returns 0 .
*
* NOTE ( review ) : rtsectsize is filled from sb_blocksize , not from a
* sector size field - confirm that this is intended .
*/
int
xfs_fs_geometry (
xfs_mount_t * mp ,
xfs_fsop_geom_t * geo ,
int new_version )
{
2011-02-14 16:45:28 +03:00
/* Start from an all-zero structure so unpopulated fields read as 0. */
memset ( geo , 0 , sizeof ( * geo ) ) ;
2005-04-17 02:20:36 +04:00
geo - > blocksize = mp - > m_sb . sb_blocksize ;
geo - > rtextsize = mp - > m_sb . sb_rextsize ;
geo - > agblocks = mp - > m_sb . sb_agblocks ;
geo - > agcount = mp - > m_sb . sb_agcount ;
geo - > logblocks = mp - > m_sb . sb_logblocks ;
geo - > sectsize = mp - > m_sb . sb_sectsize ;
geo - > inodesize = mp - > m_sb . sb_inodesize ;
geo - > imaxpct = mp - > m_sb . sb_imax_pct ;
geo - > datablocks = mp - > m_sb . sb_dblocks ;
geo - > rtblocks = mp - > m_sb . sb_rblocks ;
geo - > rtextents = mp - > m_sb . sb_rextents ;
geo - > logstart = mp - > m_sb . sb_logstart ;
/* The memcpy below copies sizeof ( sb_uuid ) bytes ; both sides must match. */
ASSERT ( sizeof ( geo - > uuid ) = = sizeof ( mp - > m_sb . sb_uuid ) ) ;
memcpy ( geo - > uuid , & mp - > m_sb . sb_uuid , sizeof ( mp - > m_sb . sb_uuid ) ) ;
/* Stripe unit / width are only reported from version 2 onwards. */
if ( new_version > = 2 ) {
geo - > sunit = mp - > m_sb . sb_unit ;
geo - > swidth = mp - > m_sb . sb_width ;
}
/*
* Version 3 and later : report the structure version and a feature
* flag word . NLINK and DIRV2 are set unconditionally ; every other
* flag is set only when the matching xfs_sb_version_has * ( ) check
* on mp - > m_sb is true .
*/
if ( new_version > = 3 ) {
geo - > version = XFS_FSOP_GEOM_VERSION ;
2014-05-20 01:46:40 +04:00
geo - > flags = XFS_FSOP_GEOM_FLAGS_NLINK |
2014-05-20 01:46:55 +04:00
XFS_FSOP_GEOM_FLAGS_DIRV2 |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasattr ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_ATTR : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasquota ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_QUOTA : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasalign ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_IALIGN : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasdalign ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_DALIGN : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasextflgbit ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hassector ( & mp - > m_sb ) ?
2005-11-02 02:34:53 +03:00
XFS_FSOP_GEOM_FLAGS_SECTOR : 0 ) |
2008-05-21 10:58:55 +04:00
( xfs_sb_version_hasasciici ( & mp - > m_sb ) ?
XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0 ) |
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 09:26:31 +04:00
( xfs_sb_version_haslazysbcount ( & mp - > m_sb ) ?
XFS_FSOP_GEOM_FLAGS_LAZYSB : 0 ) |
2008-03-06 05:44:28 +03:00
( xfs_sb_version_hasattr2 ( & mp - > m_sb ) ?
2012-10-09 23:11:45 +04:00
XFS_FSOP_GEOM_FLAGS_ATTR2 : 0 ) |
( xfs_sb_version_hasprojid32bit ( & mp - > m_sb ) ?
2013-05-27 10:38:26 +04:00
XFS_FSOP_GEOM_FLAGS_PROJID32 : 0 ) |
( xfs_sb_version_hascrc ( & mp - > m_sb ) ?
2013-10-03 01:00:06 +04:00
XFS_FSOP_GEOM_FLAGS_V5SB : 0 ) |
( xfs_sb_version_hasftype ( & mp - > m_sb ) ?
2014-04-24 10:01:41 +04:00
XFS_FSOP_GEOM_FLAGS_FTYPE : 0 ) |
( xfs_sb_version_hasfinobt ( & mp - > m_sb ) ?
2015-05-29 01:58:32 +03:00
XFS_FSOP_GEOM_FLAGS_FINOBT : 0 ) |
( xfs_sb_version_hassparseinodes ( & mp - > m_sb ) ?
2016-08-03 05:16:44 +03:00
XFS_FSOP_GEOM_FLAGS_SPINODES : 0 ) |
( xfs_sb_version_hasrmapbt ( & mp - > m_sb ) ?
2016-10-03 19:11:30 +03:00
XFS_FSOP_GEOM_FLAGS_RMAPBT : 0 ) |
( xfs_sb_version_hasreflink ( & mp - > m_sb ) ?
XFS_FSOP_GEOM_FLAGS_REFLINK : 0 ) ;
2008-03-06 05:44:28 +03:00
/* Without the sector feature , BBSIZE is reported as the log sector size. */
geo - > logsectsize = xfs_sb_version_hassector ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
mp - > m_sb . sb_logsectsize : BBSIZE ;
geo - > rtsectsize = mp - > m_sb . sb_blocksize ;
2014-06-06 09:15:59 +04:00
geo - > dirblocksize = mp - > m_dir_geo - > blksize ;
2005-04-17 02:20:36 +04:00
}
/*
* Version 4 and later : OR the LOGV2 bit into flags ( which the
* version 3 branch above has already assigned ) and report logsunit .
*/
if ( new_version > = 4 ) {
geo - > flags | =
2008-03-06 05:44:28 +03:00
( xfs_sb_version_haslogv2 ( & mp - > m_sb ) ?
2005-04-17 02:20:36 +04:00
XFS_FSOP_GEOM_FLAGS_LOGV2 : 0 ) ;
geo - > logsunit = mp - > m_sb . sb_logsunit ;
}
return 0 ;
}
2012-11-12 15:53:59 +04:00
/*
* Get a zeroed , uncached buffer for writing a new header block during
* growfs .
*
* mp : mount ; the buffer is obtained against mp - > m_ddev_targp
* blkno : disk address recorded in the buffer ( b_bn and b_maps [ 0 ] )
* numblks : buffer length handed to xfs_buf_get_uncached ( )
* flags : flags handed to xfs_buf_get_uncached ( )
* ops : buffer ops attached to the buffer as bp - > b_ops
*
* Returns the buffer , or NULL if xfs_buf_get_uncached ( ) returns NULL .
* Every caller visible in this file fills the buffer in , writes it with
* xfs_bwrite ( ) and releases it with xfs_buf_relse ( ) .
*/
static struct xfs_buf *
xfs_growfs_get_hdr_buf (
struct xfs_mount * mp ,
xfs_daddr_t blkno ,
size_t numblks ,
2012-11-14 10:54:40 +04:00
int flags ,
const struct xfs_buf_ops * ops )
2012-11-12 15:53:59 +04:00
{
struct xfs_buf * bp ;
bp = xfs_buf_get_uncached ( mp - > m_ddev_targp , numblks , flags ) ;
if ( ! bp )
return NULL ;
/* Clear the whole buffer ; BBTOB presumably converts b_length to bytes. */
xfs_buf_zero ( bp , 0 , BBTOB ( bp - > b_length ) ) ;
/* The uncached get takes no address , so set the target block here. */
bp - > b_bn = blkno ;
bp - > b_maps [ 0 ] . bm_bn = blkno ;
2012-11-14 10:54:40 +04:00
bp - > b_ops = ops ;
2012-11-12 15:53:59 +04:00
return bp ;
}
2005-04-17 02:20:36 +04:00
static int
xfs_growfs_data_private (
xfs_mount_t * mp , /* mount point for filesystem */
xfs_growfs_data_t * in ) /* growfs data input struct */
{
xfs_agf_t * agf ;
2012-11-12 15:54:00 +04:00
struct xfs_agfl * agfl ;
2005-04-17 02:20:36 +04:00
xfs_agi_t * agi ;
xfs_agnumber_t agno ;
xfs_extlen_t agsize ;
xfs_extlen_t tmpsize ;
xfs_alloc_rec_t * arec ;
xfs_buf_t * bp ;
int bucket ;
int dpct ;
xfs: don't break from growfs ag update loop on error
When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct. If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.
With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.
And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.
Even with this change, growfs will still say:
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
data blocks changed from 319815680 to 335216640
which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.
And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-11 23:14:05 +04:00
int error , saved_error = 0 ;
2005-04-17 02:20:36 +04:00
xfs_agnumber_t nagcount ;
xfs_agnumber_t nagimax = 0 ;
xfs_rfsblock_t nb , nb_mod ;
xfs_rfsblock_t new ;
xfs_rfsblock_t nfree ;
xfs_agnumber_t oagcount ;
int pct ;
xfs_trans_t * tp ;
nb = in - > newblocks ;
pct = in - > imaxpct ;
if ( nb < mp - > m_sb . sb_dblocks | | pct < 0 | | pct > 100 )
2014-06-25 08:58:08 +04:00
return - EINVAL ;
2007-05-14 12:24:02 +04:00
if ( ( error = xfs_sb_validate_fsb_count ( & mp - > m_sb , nb ) ) )
return error ;
2005-04-17 02:20:36 +04:00
dpct = pct - mp - > m_sb . sb_imax_pct ;
2014-10-02 03:05:32 +04:00
error = xfs_buf_read_uncached ( mp - > m_ddev_targp ,
2010-09-22 04:47:20 +04:00
XFS_FSB_TO_BB ( mp , nb ) - XFS_FSS_TO_BB ( mp , 1 ) ,
2014-10-02 03:05:32 +04:00
XFS_FSS_TO_BB ( mp , 1 ) , 0 , & bp , NULL ) ;
if ( error )
2012-11-12 15:54:02 +04:00
return error ;
2005-04-17 02:20:36 +04:00
xfs_buf_relse ( bp ) ;
new = nb ; /* use new as a temporary here */
nb_mod = do_div ( new , mp - > m_sb . sb_agblocks ) ;
nagcount = new + ( nb_mod ! = 0 ) ;
if ( nb_mod & & nb_mod < XFS_MIN_AG_BLOCKS ) {
nagcount - - ;
2009-05-23 23:30:12 +04:00
nb = ( xfs_rfsblock_t ) nagcount * mp - > m_sb . sb_agblocks ;
2005-04-17 02:20:36 +04:00
if ( nb < mp - > m_sb . sb_dblocks )
2014-06-25 08:58:08 +04:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
}
new = nb - mp - > m_sb . sb_dblocks ;
oagcount = mp - > m_sb . sb_agcount ;
2009-07-19 02:14:53 +04:00
2010-01-11 14:47:44 +03:00
/* allocate the new per-ag structures */
if ( nagcount > oagcount ) {
error = xfs_initialize_perag ( mp , nagcount , & nagimax ) ;
if ( error )
return error ;
2005-04-17 02:20:36 +04:00
}
2010-01-11 14:47:44 +03:00
2016-04-06 02:19:55 +03:00
error = xfs_trans_alloc ( mp , & M_RES ( mp ) - > tr_growdata ,
XFS_GROWFS_SPACE_RES ( mp ) , 0 , XFS_TRANS_RESERVE , & tp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
return error ;
2010-01-11 14:47:44 +03:00
/*
* Write new AG headers to disk . Non - transactional , but written
* synchronously so they are completed prior to the growfs transaction
* being logged .
*/
2005-04-17 02:20:36 +04:00
nfree = 0 ;
for ( agno = nagcount - 1 ; agno > = oagcount ; agno - - , new - = agsize ) {
2013-11-21 08:41:06 +04:00
__be32 * agfl_bno ;
2005-04-17 02:20:36 +04:00
/*
2012-11-12 15:54:00 +04:00
* AG freespace header block
2005-04-17 02:20:36 +04:00
*/
2012-11-12 15:53:59 +04:00
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AG_DADDR ( mp , agno , XFS_AGF_DADDR ( mp ) ) ,
2012-11-14 10:54:40 +04:00
XFS_FSS_TO_BB ( mp , 1 ) , 0 ,
& xfs_agf_buf_ops ) ;
2011-09-07 23:37:54 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2011-09-07 23:37:54 +04:00
goto error0 ;
}
2012-11-12 15:53:59 +04:00
2005-04-17 02:20:36 +04:00
agf = XFS_BUF_TO_AGF ( bp ) ;
2005-11-02 07:11:25 +03:00
agf - > agf_magicnum = cpu_to_be32 ( XFS_AGF_MAGIC ) ;
agf - > agf_versionnum = cpu_to_be32 ( XFS_AGF_VERSION ) ;
agf - > agf_seqno = cpu_to_be32 ( agno ) ;
2005-04-17 02:20:36 +04:00
if ( agno = = nagcount - 1 )
agsize =
nb -
( agno * ( xfs_rfsblock_t ) mp - > m_sb . sb_agblocks ) ;
else
agsize = mp - > m_sb . sb_agblocks ;
2005-11-02 07:11:25 +03:00
agf - > agf_length = cpu_to_be32 ( agsize ) ;
agf - > agf_roots [ XFS_BTNUM_BNOi ] = cpu_to_be32 ( XFS_BNO_BLOCK ( mp ) ) ;
agf - > agf_roots [ XFS_BTNUM_CNTi ] = cpu_to_be32 ( XFS_CNT_BLOCK ( mp ) ) ;
agf - > agf_levels [ XFS_BTNUM_BNOi ] = cpu_to_be32 ( 1 ) ;
agf - > agf_levels [ XFS_BTNUM_CNTi ] = cpu_to_be32 ( 1 ) ;
2016-08-03 04:36:08 +03:00
if ( xfs_sb_version_hasrmapbt ( & mp - > m_sb ) ) {
agf - > agf_roots [ XFS_BTNUM_RMAPi ] =
cpu_to_be32 ( XFS_RMAP_BLOCK ( mp ) ) ;
agf - > agf_levels [ XFS_BTNUM_RMAPi ] = cpu_to_be32 ( 1 ) ;
2016-08-17 01:31:49 +03:00
agf - > agf_rmap_blocks = cpu_to_be32 ( 1 ) ;
2016-08-03 04:36:08 +03:00
}
2016-04-06 00:06:20 +03:00
agf - > agf_flfirst = cpu_to_be32 ( 1 ) ;
agf - > agf_fllast = 0 ;
2005-04-17 02:20:36 +04:00
agf - > agf_flcount = 0 ;
2016-08-03 04:31:47 +03:00
tmpsize = agsize - mp - > m_ag_prealloc_blocks ;
2005-11-02 07:11:25 +03:00
agf - > agf_freeblks = cpu_to_be32 ( tmpsize ) ;
agf - > agf_longest = cpu_to_be32 ( tmpsize ) ;
2013-04-03 09:11:13 +04:00
if ( xfs_sb_version_hascrc ( & mp - > m_sb ) )
2015-08-19 03:31:41 +03:00
uuid_copy ( & agf - > agf_uuid , & mp - > m_sb . sb_meta_uuid ) ;
2016-10-03 19:11:18 +03:00
if ( xfs_sb_version_hasreflink ( & mp - > m_sb ) ) {
agf - > agf_refcount_root = cpu_to_be32 (
xfs_refc_block ( mp ) ) ;
agf - > agf_refcount_level = cpu_to_be32 ( 1 ) ;
agf - > agf_refcount_blocks = cpu_to_be32 ( 1 ) ;
}
2013-04-03 09:11:13 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
2011-08-23 12:28:07 +04:00
2012-11-12 15:54:00 +04:00
/*
* AG freelist header block
*/
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AG_DADDR ( mp , agno , XFS_AGFL_DADDR ( mp ) ) ,
2012-11-14 10:54:40 +04:00
XFS_FSS_TO_BB ( mp , 1 ) , 0 ,
& xfs_agfl_buf_ops ) ;
2012-11-12 15:54:00 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2012-11-12 15:54:00 +04:00
goto error0 ;
}
agfl = XFS_BUF_TO_AGFL ( bp ) ;
2013-04-03 09:11:14 +04:00
if ( xfs_sb_version_hascrc ( & mp - > m_sb ) ) {
agfl - > agfl_magicnum = cpu_to_be32 ( XFS_AGFL_MAGIC ) ;
agfl - > agfl_seqno = cpu_to_be32 ( agno ) ;
2015-08-19 03:31:41 +03:00
uuid_copy ( & agfl - > agfl_uuid , & mp - > m_sb . sb_meta_uuid ) ;
2013-04-03 09:11:14 +04:00
}
2013-11-21 08:41:06 +04:00
agfl_bno = XFS_BUF_TO_AGFL_BNO ( mp , bp ) ;
2012-11-12 15:54:00 +04:00
for ( bucket = 0 ; bucket < XFS_AGFL_SIZE ( mp ) ; bucket + + )
2013-11-21 08:41:06 +04:00
agfl_bno [ bucket ] = cpu_to_be32 ( NULLAGBLOCK ) ;
2012-11-12 15:54:00 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
goto error0 ;
2005-04-17 02:20:36 +04:00
/*
* AG inode header block
*/
2012-11-12 15:53:59 +04:00
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AG_DADDR ( mp , agno , XFS_AGI_DADDR ( mp ) ) ,
2012-11-14 10:54:40 +04:00
XFS_FSS_TO_BB ( mp , 1 ) , 0 ,
& xfs_agi_buf_ops ) ;
2011-09-07 23:37:54 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2011-09-07 23:37:54 +04:00
goto error0 ;
}
2012-11-12 15:53:59 +04:00
2005-04-17 02:20:36 +04:00
agi = XFS_BUF_TO_AGI ( bp ) ;
2005-11-02 07:11:25 +03:00
agi - > agi_magicnum = cpu_to_be32 ( XFS_AGI_MAGIC ) ;
agi - > agi_versionnum = cpu_to_be32 ( XFS_AGI_VERSION ) ;
agi - > agi_seqno = cpu_to_be32 ( agno ) ;
agi - > agi_length = cpu_to_be32 ( agsize ) ;
2005-04-17 02:20:36 +04:00
agi - > agi_count = 0 ;
2005-11-02 07:11:25 +03:00
agi - > agi_root = cpu_to_be32 ( XFS_IBT_BLOCK ( mp ) ) ;
agi - > agi_level = cpu_to_be32 ( 1 ) ;
2005-04-17 02:20:36 +04:00
agi - > agi_freecount = 0 ;
2005-11-02 07:11:25 +03:00
agi - > agi_newino = cpu_to_be32 ( NULLAGINO ) ;
agi - > agi_dirino = cpu_to_be32 ( NULLAGINO ) ;
2013-04-03 09:11:15 +04:00
if ( xfs_sb_version_hascrc ( & mp - > m_sb ) )
2015-08-19 03:31:41 +03:00
uuid_copy ( & agi - > agi_uuid , & mp - > m_sb . sb_meta_uuid ) ;
2014-04-24 10:01:39 +04:00
if ( xfs_sb_version_hasfinobt ( & mp - > m_sb ) ) {
agi - > agi_free_root = cpu_to_be32 ( XFS_FIBT_BLOCK ( mp ) ) ;
agi - > agi_free_level = cpu_to_be32 ( 1 ) ;
}
2005-04-17 02:20:36 +04:00
for ( bucket = 0 ; bucket < XFS_AGI_UNLINKED_BUCKETS ; bucket + + )
2005-11-02 07:11:25 +03:00
agi - > agi_unlinked [ bucket ] = cpu_to_be32 ( NULLAGINO ) ;
2013-04-03 09:11:15 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
2011-08-23 12:28:07 +04:00
2005-04-17 02:20:36 +04:00
/*
* BNO btree root block
*/
2012-11-12 15:53:59 +04:00
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_BNO_BLOCK ( mp ) ) ,
2012-11-14 10:54:40 +04:00
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_allocbt_buf_ops ) ;
2012-11-12 15:53:59 +04:00
2011-09-07 23:37:54 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2011-09-07 23:37:54 +04:00
goto error0 ;
}
2012-11-14 02:40:27 +04:00
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_BNO , 0 , 1 , agno , 0 ) ;
2013-04-21 23:53:46 +04:00
2012-11-14 02:40:27 +04:00
arec = XFS_ALLOC_REC_ADDR ( mp , XFS_BUF_TO_BLOCK ( bp ) , 1 ) ;
2016-08-03 04:31:47 +03:00
arec - > ar_startblock = cpu_to_be32 ( mp - > m_ag_prealloc_blocks ) ;
2005-11-02 07:11:25 +03:00
arec - > ar_blockcount = cpu_to_be32 (
agsize - be32_to_cpu ( arec - > ar_startblock ) ) ;
2012-11-14 02:40:27 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
2011-08-23 12:28:07 +04:00
2005-04-17 02:20:36 +04:00
/*
* CNT btree root block
*/
2012-11-12 15:53:59 +04:00
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_CNT_BLOCK ( mp ) ) ,
2012-11-14 10:54:40 +04:00
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_allocbt_buf_ops ) ;
2011-09-07 23:37:54 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2011-09-07 23:37:54 +04:00
goto error0 ;
}
2012-11-14 02:40:27 +04:00
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_CNT , 0 , 1 , agno , 0 ) ;
2013-04-21 23:53:46 +04:00
2012-11-14 02:40:27 +04:00
arec = XFS_ALLOC_REC_ADDR ( mp , XFS_BUF_TO_BLOCK ( bp ) , 1 ) ;
2016-08-03 04:31:47 +03:00
arec - > ar_startblock = cpu_to_be32 ( mp - > m_ag_prealloc_blocks ) ;
2005-11-02 07:11:25 +03:00
arec - > ar_blockcount = cpu_to_be32 (
agsize - be32_to_cpu ( arec - > ar_startblock ) ) ;
nfree + = be32_to_cpu ( arec - > ar_blockcount ) ;
2012-11-14 02:40:27 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
2011-08-23 12:28:07 +04:00
2016-08-03 04:36:08 +03:00
/* RMAP btree root block */
if ( xfs_sb_version_hasrmapbt ( & mp - > m_sb ) ) {
struct xfs_rmap_rec * rrec ;
struct xfs_btree_block * block ;
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_RMAP_BLOCK ( mp ) ) ,
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_rmapbt_buf_ops ) ;
if ( ! bp ) {
error = - ENOMEM ;
goto error0 ;
}
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_RMAP , 0 , 0 ,
2017-01-28 10:16:37 +03:00
agno , 0 ) ;
2016-08-03 04:36:08 +03:00
block = XFS_BUF_TO_BLOCK ( bp ) ;
/*
* Mark the AG header regions as static metadata . The BNO
* btree block is the first block after the headers , so
* its location defines the size of the region the static
* metadata consumes .
*
* Note : unlike mkfs , we never have to account for log
* space when growing the data regions
*/
rrec = XFS_RMAP_REC_ADDR ( block , 1 ) ;
rrec - > rm_startblock = 0 ;
rrec - > rm_blockcount = cpu_to_be32 ( XFS_BNO_BLOCK ( mp ) ) ;
rrec - > rm_owner = cpu_to_be64 ( XFS_RMAP_OWN_FS ) ;
rrec - > rm_offset = 0 ;
be16_add_cpu ( & block - > bb_numrecs , 1 ) ;
/* account freespace btree root blocks */
rrec = XFS_RMAP_REC_ADDR ( block , 2 ) ;
rrec - > rm_startblock = cpu_to_be32 ( XFS_BNO_BLOCK ( mp ) ) ;
rrec - > rm_blockcount = cpu_to_be32 ( 2 ) ;
rrec - > rm_owner = cpu_to_be64 ( XFS_RMAP_OWN_AG ) ;
rrec - > rm_offset = 0 ;
be16_add_cpu ( & block - > bb_numrecs , 1 ) ;
/* account inode btree root blocks */
rrec = XFS_RMAP_REC_ADDR ( block , 3 ) ;
rrec - > rm_startblock = cpu_to_be32 ( XFS_IBT_BLOCK ( mp ) ) ;
rrec - > rm_blockcount = cpu_to_be32 ( XFS_RMAP_BLOCK ( mp ) -
XFS_IBT_BLOCK ( mp ) ) ;
rrec - > rm_owner = cpu_to_be64 ( XFS_RMAP_OWN_INOBT ) ;
rrec - > rm_offset = 0 ;
be16_add_cpu ( & block - > bb_numrecs , 1 ) ;
/* account for rmap btree root */
rrec = XFS_RMAP_REC_ADDR ( block , 4 ) ;
rrec - > rm_startblock = cpu_to_be32 ( XFS_RMAP_BLOCK ( mp ) ) ;
rrec - > rm_blockcount = cpu_to_be32 ( 1 ) ;
rrec - > rm_owner = cpu_to_be64 ( XFS_RMAP_OWN_AG ) ;
rrec - > rm_offset = 0 ;
be16_add_cpu ( & block - > bb_numrecs , 1 ) ;
2016-10-03 19:11:18 +03:00
/* account for refc btree root */
if ( xfs_sb_version_hasreflink ( & mp - > m_sb ) ) {
rrec = XFS_RMAP_REC_ADDR ( block , 5 ) ;
rrec - > rm_startblock = cpu_to_be32 (
xfs_refc_block ( mp ) ) ;
rrec - > rm_blockcount = cpu_to_be32 ( 1 ) ;
rrec - > rm_owner = cpu_to_be64 ( XFS_RMAP_OWN_REFC ) ;
rrec - > rm_offset = 0 ;
be16_add_cpu ( & block - > bb_numrecs , 1 ) ;
}
2016-08-03 04:36:08 +03:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
goto error0 ;
}
2005-04-17 02:20:36 +04:00
/*
* INO btree root block
*/
2012-11-12 15:53:59 +04:00
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_IBT_BLOCK ( mp ) ) ,
2012-11-14 10:54:40 +04:00
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_inobt_buf_ops ) ;
2011-09-07 23:37:54 +04:00
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2011-09-07 23:37:54 +04:00
goto error0 ;
}
2012-11-12 15:53:59 +04:00
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_INO , 0 , 0 , agno , 0 ) ;
2012-11-14 02:40:27 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
2014-04-24 10:01:39 +04:00
/*
* FINO btree root block
*/
if ( xfs_sb_version_hasfinobt ( & mp - > m_sb ) ) {
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_FIBT_BLOCK ( mp ) ) ,
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_inobt_buf_ops ) ;
if ( ! bp ) {
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2014-04-24 10:01:39 +04:00
goto error0 ;
}
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_FINO ,
2017-01-28 10:16:37 +03:00
0 , 0 , agno , 0 ) ;
2014-04-24 10:01:39 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
goto error0 ;
}
2016-10-03 19:11:18 +03:00
/*
* refcount btree root block
*/
if ( xfs_sb_version_hasreflink ( & mp - > m_sb ) ) {
bp = xfs_growfs_get_hdr_buf ( mp ,
XFS_AGB_TO_DADDR ( mp , agno , xfs_refc_block ( mp ) ) ,
BTOBB ( mp - > m_sb . sb_blocksize ) , 0 ,
& xfs_refcountbt_buf_ops ) ;
if ( ! bp ) {
error = - ENOMEM ;
goto error0 ;
}
2017-01-28 10:16:39 +03:00
xfs_btree_init_block ( mp , bp , XFS_BTNUM_REFC ,
2017-01-28 10:16:37 +03:00
0 , 0 , agno , 0 ) ;
2016-10-03 19:11:18 +03:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error )
goto error0 ;
}
2005-04-17 02:20:36 +04:00
}
xfs_trans_agblocks_delta ( tp , nfree ) ;
/*
* There are new blocks in the old last a . g .
*/
if ( new ) {
2016-08-03 04:33:42 +03:00
struct xfs_owner_info oinfo ;
2005-04-17 02:20:36 +04:00
/*
* Change the agi length .
*/
error = xfs_ialloc_read_agi ( mp , tp , agno , & bp ) ;
if ( error ) {
goto error0 ;
}
ASSERT ( bp ) ;
agi = XFS_BUF_TO_AGI ( bp ) ;
2008-02-14 02:03:29 +03:00
be32_add_cpu ( & agi - > agi_length , new ) ;
2005-04-17 02:20:36 +04:00
ASSERT ( nagcount = = oagcount | |
2005-11-02 07:11:25 +03:00
be32_to_cpu ( agi - > agi_length ) = = mp - > m_sb . sb_agblocks ) ;
2005-04-17 02:20:36 +04:00
xfs_ialloc_log_agi ( tp , bp , XFS_AGI_LENGTH ) ;
/*
* Change agf length .
*/
error = xfs_alloc_read_agf ( mp , tp , agno , 0 , & bp ) ;
if ( error ) {
goto error0 ;
}
ASSERT ( bp ) ;
agf = XFS_BUF_TO_AGF ( bp ) ;
2008-02-14 02:03:29 +03:00
be32_add_cpu ( & agf - > agf_length , new ) ;
2005-11-02 07:11:25 +03:00
ASSERT ( be32_to_cpu ( agf - > agf_length ) = =
be32_to_cpu ( agi - > agi_length ) ) ;
2009-12-15 02:14:59 +03:00
2007-06-18 10:50:08 +04:00
xfs_alloc_log_agf ( tp , bp , XFS_AGF_LENGTH ) ;
2016-08-03 04:33:42 +03:00
2005-04-17 02:20:36 +04:00
/*
* Free the new space .
2016-08-03 04:33:42 +03:00
*
* XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
* this doesn ' t actually exist in the rmap btree .
2005-04-17 02:20:36 +04:00
*/
2016-08-03 04:33:42 +03:00
xfs_rmap_ag_owner ( & oinfo , XFS_RMAP_OWN_NULL ) ;
error = xfs_free_extent ( tp ,
XFS_AGB_TO_FSB ( mp , agno ,
be32_to_cpu ( agf - > agf_length ) - new ) ,
xfs: set up per-AG free space reservations
One unfortunate quirk of the reference count and reverse mapping
btrees -- they can expand in size when blocks are written to *other*
allocation groups if, say, one large extent becomes a lot of tiny
extents. Since we don't want to start throwing errors in the middle
of CoWing, we need to reserve some blocks to handle future expansion.
The transaction block reservation counters aren't sufficient here
because we have to have a reserve of blocks in every AG, not just
somewhere in the filesystem.
Therefore, create two per-AG block reservation pools. One feeds the
AGFL so that rmapbt expansion always succeeds, and the other feeds all
other metadata so that refcountbt expansion never fails.
Use the count of how many reserved blocks we need to have on hand to
create a virtual reservation in the AG. Through selective clamping of
the maximum length of allocation requests and of the length of the
longest free extent, we can make it look like there's less free space
in the AG unless the reservation owner is asking for blocks.
In other words, play some accounting tricks in-core to make sure that
we always have blocks available. On the plus side, there's nothing to
clean up if we crash, which is in contrast to the strategy that the rough
draft used (actually removing extents from the freespace btrees).
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-19 03:30:52 +03:00
new , & oinfo , XFS_AG_RESV_NONE ) ;
2016-08-03 04:33:42 +03:00
if ( error )
2005-04-17 02:20:36 +04:00
goto error0 ;
}
2010-01-11 14:47:44 +03:00
/*
* Update changed superblock fields transactionally . These are not
* seen by the rest of the world until the transaction commit applies
* them atomically to the superblock .
*/
2005-04-17 02:20:36 +04:00
if ( nagcount > oagcount )
xfs_trans_mod_sb ( tp , XFS_TRANS_SB_AGCOUNT , nagcount - oagcount ) ;
if ( nb > mp - > m_sb . sb_dblocks )
xfs_trans_mod_sb ( tp , XFS_TRANS_SB_DBLOCKS ,
nb - mp - > m_sb . sb_dblocks ) ;
if ( nfree )
xfs_trans_mod_sb ( tp , XFS_TRANS_SB_FDBLOCKS , nfree ) ;
if ( dpct )
xfs_trans_mod_sb ( tp , XFS_TRANS_SB_IMAXPCT , dpct ) ;
2015-02-05 03:13:21 +03:00
xfs_trans_set_sync ( tp ) ;
2015-06-04 06:48:08 +03:00
error = xfs_trans_commit ( tp ) ;
2010-01-11 14:47:44 +03:00
if ( error )
2005-04-17 02:20:36 +04:00
return error ;
2010-01-11 14:47:44 +03:00
2005-04-17 02:20:36 +04:00
/* New allocation groups fully initialized, so update mount struct */
if ( nagimax )
mp - > m_maxagi = nagimax ;
if ( mp - > m_sb . sb_imax_pct ) {
2017-06-16 21:00:05 +03:00
uint64_t icount = mp - > m_sb . sb_dblocks * mp - > m_sb . sb_imax_pct ;
2005-04-17 02:20:36 +04:00
do_div ( icount , 100 ) ;
mp - > m_maxicount = icount < < mp - > m_sb . sb_inopblog ;
} else
mp - > m_maxicount = 0 ;
2011-01-04 03:35:03 +03:00
xfs_set_low_space_thresholds ( mp ) ;
2016-08-03 04:38:24 +03:00
mp - > m_alloc_set_aside = xfs_alloc_set_aside ( mp ) ;
2010-01-11 14:47:44 +03:00
2017-01-04 05:39:33 +03:00
/*
* If we expanded the last AG , free the per - AG reservation
* so we can reinitialize it with the new size .
*/
if ( new ) {
struct xfs_perag * pag ;
pag = xfs_perag_get ( mp , agno ) ;
error = xfs_ag_resv_free ( pag ) ;
xfs_perag_put ( pag ) ;
if ( error )
goto out ;
}
2016-10-03 19:11:44 +03:00
/* Reserve AG metadata blocks. */
error = xfs_fs_reserve_ag_blocks ( mp ) ;
if ( error & & error ! = - ENOSPC )
goto out ;
2010-01-11 14:47:44 +03:00
/* update secondary superblocks. */
2005-04-17 02:20:36 +04:00
for ( agno = 1 ; agno < nagcount ; agno + + ) {
2012-10-09 07:50:52 +04:00
error = 0 ;
/*
* new secondary superblocks need to be zeroed , not read from
* disk as the contents of the new area we are growing into is
* completely unknown .
*/
if ( agno < oagcount ) {
error = xfs_trans_read_buf ( mp , NULL , mp - > m_ddev_targp ,
2005-04-17 02:20:36 +04:00
XFS_AGB_TO_DADDR ( mp , agno , XFS_SB_BLOCK ( mp ) ) ,
2012-11-12 15:54:03 +04:00
XFS_FSS_TO_BB ( mp , 1 ) , 0 , & bp ,
2012-11-14 10:54:40 +04:00
& xfs_sb_buf_ops ) ;
2012-10-09 07:50:52 +04:00
} else {
bp = xfs_trans_get_buf ( NULL , mp - > m_ddev_targp ,
XFS_AGB_TO_DADDR ( mp , agno , XFS_SB_BLOCK ( mp ) ) ,
XFS_FSS_TO_BB ( mp , 1 ) , 0 ) ;
2012-11-14 10:53:49 +04:00
if ( bp ) {
2012-11-14 10:54:40 +04:00
bp - > b_ops = & xfs_sb_buf_ops ;
2012-10-09 07:50:52 +04:00
xfs_buf_zero ( bp , 0 , BBTOB ( bp - > b_length ) ) ;
2012-11-14 10:53:49 +04:00
} else
2014-06-25 08:58:08 +04:00
error = - ENOMEM ;
2012-10-09 07:50:52 +04:00
}
xfs: don't break from growfs ag update loop on error
When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct. If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.
With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.
And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.
Even with this change, growfs will still say:
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
data blocks changed from 319815680 to 335216640
which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.
And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-11 23:14:05 +04:00
/*
* If we get an error reading or writing alternate superblocks ,
* continue . xfs_repair chooses the " best " superblock based
* on most matches ; if we break early , we ' ll leave more
* superblocks un - updated than updated , and xfs_repair may
* pick them over the properly - updated primary .
*/
2005-04-17 02:20:36 +04:00
if ( error ) {
2011-03-07 02:05:35 +03:00
xfs_warn ( mp ,
" error %d reading secondary superblock for ag %d " ,
2005-04-17 02:20:36 +04:00
error , agno ) ;
xfs: don't break from growfs ag update loop on error
When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct. If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.
With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.
And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.
Even with this change, growfs will still say:
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
data blocks changed from 319815680 to 335216640
which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.
And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-11 23:14:05 +04:00
saved_error = error ;
continue ;
2005-04-17 02:20:36 +04:00
}
xfs: remove bitfield based superblock updates
When we log changes to the superblock, we first have to write them
to the on-disk buffer, and then log that. Right now we have a
complex bitfield based arrangement to only write the modified field
to the buffer before we log it.
This used to be necessary as a performance optimisation because we
logged the superblock buffer in every extent or inode allocation or
freeing, and so performance was extremely important. We haven't done
this for years, however, ever since the lazy superblock counters
pulled the superblock logging out of the transaction commit
fast path.
Hence we have a bunch of complexity that is not necessary that makes
writing the in-core superblock to disk much more complex than it
needs to be. We only need to log the superblock now during
management operations (e.g. during mount, unmount or quota control
operations) so it is not a performance critical path anymore.
As such, remove the complex field based logging mechanism and
replace it with a simple conversion function similar to what we use
for all other on-disk structures.
This means we always log the entirety of the superblock, but again
because we rarely modify the superblock this is not an issue for log
bandwidth or CPU time. Indeed, if we do log the superblock
frequently, delayed logging will minimise the impact of this
overhead.
[Fixed gquota/pquota inode sharing regression noticed by bfoster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-01-22 01:10:26 +03:00
xfs_sb_to_disk ( XFS_BUF_TO_SBP ( bp ) , & mp - > m_sb ) ;
2012-11-12 15:54:03 +04:00
2011-08-23 12:28:07 +04:00
error = xfs_bwrite ( bp ) ;
xfs_buf_relse ( bp ) ;
if ( error ) {
2011-03-07 02:05:35 +03:00
xfs_warn ( mp ,
2005-04-17 02:20:36 +04:00
" write error %d updating secondary superblock for ag %d " ,
error , agno ) ;
xfs: don't break from growfs ag update loop on error
When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct. If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.
With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.
And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.
Even with this change, growfs will still say:
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
data blocks changed from 319815680 to 335216640
which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.
And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-11 23:14:05 +04:00
saved_error = error ;
continue ;
2005-04-17 02:20:36 +04:00
}
}
2016-10-03 19:11:44 +03:00
out :
xfs: don't break from growfs ag update loop on error
When xfs_growfs_data_private() is updating backup superblocks,
it bails out on the first error encountered, whether reading or
writing:
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
This can cause a problem later during repair, because repair
looks at all superblocks, and picks the most prevalent one
as correct. If we bail out early in the backup superblock
loop, we can end up with more "bad" matching superblocks than
good, and a post-growfs repair may revert the filesystem to
the old geometry.
With the combination of superblock verifiers and old bugs,
we're more likely to encounter read errors due to verification.
And perhaps even worse, we don't even properly write any of the
newly-added superblocks in the new AGs.
Even with this change, growfs will still say:
xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Structure needs cleaning
data blocks changed from 319815680 to 335216640
which might be confusing to the user, but it at least communicates
that something has gone wrong, and dmesg will probably highlight
the need for an xfs_repair.
And this is still best-effort; if verifiers fail on more than
half the backup supers, they may still "win" - but that's probably
best left to repair to more gracefully handle by doing its own
strict verification as part of the backup super "voting."
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-11 23:14:05 +04:00
return saved_error ? saved_error : error ;
2005-04-17 02:20:36 +04:00
error0 :
2015-06-04 06:47:56 +03:00
xfs_trans_cancel ( tp ) ;
2005-04-17 02:20:36 +04:00
return error ;
}
/*
 * Validate a request to change the size or placement of the log.
 *
 * Changing the log is not implemented, so every path through this function
 * returns an error:
 *   -EINVAL  the requested size is below the minimum log size, or the request
 *            matches the current log (same block count, same
 *            internal/external placement);
 *   -ENOSYS  the request is otherwise acceptable but cannot be carried out.
 */
static int
xfs_growfs_log_private(
	xfs_mount_t		*mp,	/* mount point for filesystem */
	xfs_growfs_log_t	*in)	/* growfs log input struct */
{
	xfs_extlen_t		new_logblocks = in->newblocks;
	int			log_is_internal;

	/* The new log must meet both the block-count and byte-size minimums. */
	if (new_logblocks < XFS_MIN_LOG_BLOCKS)
		return -EINVAL;
	if (new_logblocks < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
		return -EINVAL;

	/* A non-zero sb_logstart means the log currently lives internally. */
	log_is_internal = (mp->m_sb.sb_logstart != 0);

	/* Asking for exactly what we already have is not a valid request. */
	if (new_logblocks == mp->m_sb.sb_logblocks &&
	    in->isint == log_is_internal)
		return -EINVAL;

	/*
	 * Relocating or resizing the log is difficult: it needs new
	 * interfaces to sync the log first and to hold off all activity
	 * while the log is moved.  The log could become shorter or longer
	 * within the same space, or switch between internal and external.
	 */
	return -ENOSYS;
}
/*
* Protected versions of the growfs functions: these acquire and release the
* grow lock on the mount point. They are exported through the ioctls
* XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
*/
/*
 * Grow the data section of the filesystem while holding mp->m_growlock.
 *
 * Returns -EPERM if the caller lacks CAP_SYS_ADMIN, -EWOULDBLOCK if
 * m_growlock cannot be taken immediately, and otherwise whatever
 * xfs_growfs_data_private() returns.
 */
int
xfs_growfs_data(
	xfs_mount_t		*mp,
	xfs_growfs_data_t	*in)
{
	int			ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Do not wait if m_growlock is already held. */
	if (mutex_trylock(&mp->m_growlock) == 0)
		return -EWOULDBLOCK;

	ret = xfs_growfs_data_private(mp, in);

	/*
	 * Bump the generation regardless of the result: a failure may have
	 * come from updating the secondary superblocks, in which case the
	 * new size is already live.
	 */
	++mp->m_generation;

	mutex_unlock(&mp->m_growlock);
	return ret;
}
/*
 * Handle a log grow request while holding mp->m_growlock.
 *
 * Returns -EPERM if the caller lacks CAP_SYS_ADMIN, -EWOULDBLOCK if
 * m_growlock cannot be taken immediately, and otherwise whatever
 * xfs_growfs_log_private() returns.
 */
int
xfs_growfs_log(
	xfs_mount_t		*mp,
	xfs_growfs_log_t	*in)
{
	int			ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Do not wait if m_growlock is already held. */
	if (mutex_trylock(&mp->m_growlock) == 0)
		return -EWOULDBLOCK;

	ret = xfs_growfs_log_private(mp, in);

	mutex_unlock(&mp->m_growlock);
	return ret;
}
/*
 * Fill in the filesystem counters reported through ioctl XFS_IOC_FSCOUNTS.
 *
 * The inode and free data block figures come from non-negative reads of the
 * per-cpu counters; the free data block figure has m_alloc_set_aside
 * subtracted from it.  The free realtime extent count is copied from the
 * in-core superblock under m_sb_lock.  Always returns 0.
 */
int
xfs_fs_counts(
	xfs_mount_t		*mp,
	xfs_fsop_counts_t	*cnt)
{
	/* The realtime extent count lives in the superblock, so lock it. */
	spin_lock(&mp->m_sb_lock);
	cnt->freertx = mp->m_sb.sb_frextents;
	spin_unlock(&mp->m_sb_lock);

	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
			mp->m_alloc_set_aside;
	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);

	return 0;
}
/*
* exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
*
* xfs_reserve_blocks is called to set m_resblks
* in the in - core mount table . The number of unused reserved blocks
2006-03-29 02:55:14 +04:00
* is kept in m_resblks_avail .
2005-04-17 02:20:36 +04:00
*
* Reserve the requested number of blocks if available. Otherwise reserve
* as many as possible towards satisfying the request. The actual number
* reserved is returned in outval.
*
* A null inval pointer indicates that only the current reserved block
* counts should be returned; no settings are changed.
*/
int
xfs_reserve_blocks (
xfs_mount_t * mp ,
2017-06-16 21:00:05 +03:00
uint64_t * inval ,
2005-04-17 02:20:36 +04:00
xfs_fsop_resblks_t * outval )
{
2017-06-16 21:00:05 +03:00
int64_t lcounter , delta ;
int64_t fdblks_delta = 0 ;
uint64_t request ;
int64_t free ;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
int error = 0 ;
2005-04-17 02:20:36 +04:00
/* If inval is null, report current values and return */
2017-06-16 21:00:05 +03:00
if ( inval = = ( uint64_t * ) NULL ) {
2007-06-18 10:50:27 +04:00
if ( ! outval )
2014-06-25 08:58:08 +04:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
outval - > resblks = mp - > m_resblks ;
outval - > resblks_avail = mp - > m_resblks_avail ;
2006-01-15 04:37:08 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
request = * inval ;
2007-02-10 10:36:17 +03:00
/*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
* With per-cpu counters, this becomes an interesting problem. We need
* to work out whether we are freeing or allocating blocks first, then we
* can do the modification as necessary.
2007-02-10 10:36:17 +03:00
*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
* We do this under the m_sb_lock so that if we are near ENOSPC , we will
* hold out any changes while we work out what to do . This means that
* the amount of free space can change while we do this , so we need to
* retry if we end up trying to reserve more space than is available .
2007-02-10 10:36:17 +03:00
*/
2007-10-11 11:42:32 +04:00
spin_lock ( & mp - > m_sb_lock ) ;
2005-04-17 02:20:36 +04:00
/*
* If our previous reservation was larger than the current value ,
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
* then move any unused blocks back to the free pool . Modify the resblks
* counters directly since we shouldn ' t have any problems unreserving
* space .
2005-04-17 02:20:36 +04:00
*/
if ( mp - > m_resblks > request ) {
lcounter = mp - > m_resblks_avail - request ;
if ( lcounter > 0 ) { /* release unused blocks */
2007-02-10 10:36:17 +03:00
fdblks_delta = lcounter ;
2005-04-17 02:20:36 +04:00
mp - > m_resblks_avail - = lcounter ;
}
mp - > m_resblks = request ;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
if ( fdblks_delta ) {
spin_unlock ( & mp - > m_sb_lock ) ;
error = xfs_mod_fdblocks ( mp , fdblks_delta , 0 ) ;
spin_lock ( & mp - > m_sb_lock ) ;
}
goto out ;
}
2006-09-07 08:26:50 +04:00
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
/*
* If the request is larger than the current reservation , reserve the
* blocks before we update the reserve counters . Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space .
*/
error = - ENOSPC ;
do {
2015-02-23 13:22:03 +03:00
free = percpu_counter_sum ( & mp - > m_fdblocks ) -
2016-08-03 04:38:24 +03:00
mp - > m_alloc_set_aside ;
2007-02-10 10:36:17 +03:00
if ( ! free )
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
break ;
2007-02-10 10:36:17 +03:00
2005-04-17 02:20:36 +04:00
delta = request - mp - > m_resblks ;
2006-09-07 08:26:50 +04:00
lcounter = free - delta ;
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
if ( lcounter < 0 )
2005-04-17 02:20:36 +04:00
/* We can't satisfy the request, just get what we can */
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
fdblks_delta = free ;
else
fdblks_delta = delta ;
2007-02-10 10:36:17 +03:00
/*
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
* We'll either succeed in getting space from the free block
* count or we'll get an ENOSPC. If we get an ENOSPC, it means
* things changed while we were calculating fdblks_delta and so
* we should try again to see if there is anything left to
* reserve.
2007-02-10 10:36:17 +03:00
*
* Don ' t set the reserved flag here - we don ' t want to reserve
* the extra reserve blocks from the reserve . . . . .
*/
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
spin_unlock ( & mp - > m_sb_lock ) ;
error = xfs_mod_fdblocks ( mp , - fdblks_delta , 0 ) ;
spin_lock ( & mp - > m_sb_lock ) ;
} while ( error = = - ENOSPC ) ;
/*
* Update the reserve counters if blocks have been successfully
* allocated .
*/
if ( ! error & & fdblks_delta ) {
mp - > m_resblks + = fdblks_delta ;
mp - > m_resblks_avail + = fdblks_delta ;
2007-02-10 10:36:17 +03:00
}
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 04:53:28 +03:00
out :
if ( outval ) {
outval - > resblks = mp - > m_resblks ;
outval - > resblks_avail = mp - > m_resblks_avail ;
}
spin_unlock ( & mp - > m_sb_lock ) ;
return error ;
2005-04-17 02:20:36 +04:00
}
int
xfs_fs_goingdown (
xfs_mount_t * mp ,
2017-06-16 21:00:05 +03:00
uint32_t inflags )
2005-04-17 02:20:36 +04:00
{
switch ( inflags ) {
case XFS_FSOP_GOING_FLAGS_DEFAULT : {
2007-08-30 11:21:30 +04:00
struct super_block * sb = freeze_bdev ( mp - > m_super - > s_bdev ) ;
2005-04-17 02:20:36 +04:00
2005-11-25 08:41:47 +03:00
if ( sb & & ! IS_ERR ( sb ) ) {
2006-06-09 08:58:38 +04:00
xfs_force_shutdown ( mp , SHUTDOWN_FORCE_UMOUNT ) ;
2005-04-17 02:20:36 +04:00
thaw_bdev ( sb - > s_bdev , sb ) ;
}
2008-05-21 10:58:55 +04:00
2005-04-17 02:20:36 +04:00
break ;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH :
2006-06-09 08:58:38 +04:00
xfs_force_shutdown ( mp , SHUTDOWN_FORCE_UMOUNT ) ;
2005-04-17 02:20:36 +04:00
break ;
case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH :
2006-06-09 08:58:38 +04:00
xfs_force_shutdown ( mp ,
SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR ) ;
2005-04-17 02:20:36 +04:00
break ;
default :
2014-06-25 08:58:08 +04:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
}
return 0 ;
}
2012-04-23 09:59:03 +04:00
/*
 * Shut the filesystem down immediately while leaving it consistent.  This
 * is not an unmount: the aim is only to make sure that absolutely nothing
 * persistent happens to the filesystem from this point on.
 *
 * @flags is a mask of SHUTDOWN_* bits describing the reason for the
 * shutdown.  @fname and @lnnum identify the call site and are used only in
 * the diagnostic message.  When SHUTDOWN_FORCE_UMOUNT is set, every message
 * except the in-memory corruption alert is suppressed.
 */
void
xfs_do_force_shutdown(
	xfs_mount_t	*mp,
	int		flags,
	char		*fname,
	int		lnnum)
{
	int		log_io_error = flags & SHUTDOWN_LOG_IO_ERROR;
	int		forced_umount = flags & SHUTDOWN_FORCE_UMOUNT;

	if (!forced_umount)
		xfs_notice(mp,
	" %s(0x%x) called from line %d of file %s. Return address = 0x%p ",
			__func__, flags, lnnum, fname, __return_address);

	/*
	 * Already shut down and no log error to act on: don't duplicate
	 * the effort.
	 */
	if (XFS_FORCED_SHUTDOWN(mp) && !log_io_error)
		return;

	/*
	 * Mark the mount XFS_MOUNT_FS_SHUTDOWN, stop anybody new from
	 * queueing up on the log reservations, and wake everybody who is
	 * sleeping on log reservations so they hear the bad news.
	 */
	if (xfs_log_force_umount(mp, log_io_error))
		return;

	/* Report the cause; corruption is reported even when forced. */
	if (flags & SHUTDOWN_CORRUPT_INCORE) {
		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
	" Corruption of in-memory data detected. Shutting down filesystem ");
		if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
			xfs_stack_trace();
	} else if (!forced_umount) {
		if (log_io_error)
			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
		" Log I/O Error Detected. Shutting down filesystem ");
		else if (flags & SHUTDOWN_DEVICE_REQ)
			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
		" All device paths lost. Shutting down filesystem ");
		else if (!(flags & SHUTDOWN_REMOTE_REQ))
			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
		" I/O Error Detected. Shutting down filesystem ");
	}

	if (forced_umount)
		return;

	xfs_alert(mp,
	" Please umount the filesystem and rectify the problem(s) ");
}
2016-10-03 19:11:44 +03:00
/*
* Reserve free space for per - AG metadata .
*/
int
xfs_fs_reserve_ag_blocks (
struct xfs_mount * mp )
{
xfs_agnumber_t agno ;
struct xfs_perag * pag ;
int error = 0 ;
int err2 ;
for ( agno = 0 ; agno < mp - > m_sb . sb_agcount ; agno + + ) {
pag = xfs_perag_get ( mp , agno ) ;
err2 = xfs_ag_resv_init ( pag ) ;
xfs_perag_put ( pag ) ;
if ( err2 & & ! error )
error = err2 ;
}
if ( error & & error ! = - ENOSPC ) {
xfs_warn ( mp ,
" Error %d reserving per-AG metadata reserve pool. " , error ) ;
xfs_force_shutdown ( mp , SHUTDOWN_CORRUPT_INCORE ) ;
}
return error ;
}
/*
* Free space reserved for per - AG metadata .
*/
int
xfs_fs_unreserve_ag_blocks (
struct xfs_mount * mp )
{
xfs_agnumber_t agno ;
struct xfs_perag * pag ;
int error = 0 ;
int err2 ;
for ( agno = 0 ; agno < mp - > m_sb . sb_agcount ; agno + + ) {
pag = xfs_perag_get ( mp , agno ) ;
err2 = xfs_ag_resv_free ( pag ) ;
xfs_perag_put ( pag ) ;
if ( err2 & & ! error )
error = err2 ;
}
if ( error )
xfs_warn ( mp ,
" Error %d freeing per-AG metadata reserve pool. " , error ) ;
return error ;
}