2018-06-06 05:42:14 +03:00
// SPDX-License-Identifier: GPL-2.0
2005-04-17 02:20:36 +04:00
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
# ifndef __XFS_MOUNT_H__
# define __XFS_MOUNT_H__
2012-06-14 18:22:15 +04:00
/*
 * Forward declarations for structures that the visible part of this header
 * only refers to through pointers.
 */
struct xlog;
struct xfs_inode;
struct xfs_mru_cache;
struct xfs_ail;
struct xfs_quotainfo;
struct xfs_da_geometry;
struct xfs_perag;
2009-06-08 17:33:32 +04:00
2011-01-04 03:35:03 +03:00
/*
 * Indices for the dynamic preallocation free space thresholds, which run
 * from 5% down to 1%.  XFS_LOWSP_MAX is the number of thresholds and is
 * used to size per-threshold arrays.
 */
enum {
	XFS_LOWSP_1_PCNT	= 0,
	XFS_LOWSP_2_PCNT	= 1,
	XFS_LOWSP_3_PCNT	= 2,
	XFS_LOWSP_4_PCNT	= 3,
	XFS_LOWSP_5_PCNT	= 4,
	XFS_LOWSP_MAX		= 5,
};
2016-05-18 03:58:51 +03:00
/*
 * Error Configuration
 *
 * Error classes define the subsystem the configuration belongs to.
 * Error numbers define the errors that are configurable.
 *
 * This enum lists the error classes; XFS_ERR_CLASS_MAX is the number of
 * classes and must stay last.
 */
enum {
	XFS_ERR_METADATA,
	XFS_ERR_CLASS_MAX,
};
/*
 * Configurable error numbers.  XFS_ERR_ERRNO_MAX is the number of entries
 * and must stay last.
 */
enum {
	XFS_ERR_DEFAULT,
	XFS_ERR_EIO,
	XFS_ERR_ENOSPC,
	XFS_ERR_ENODEV,
	XFS_ERR_ERRNO_MAX,
};
2016-05-18 04:08:15 +03:00
/* Sentinel retry value meaning "retry forever"; parenthesised so it expands safely in any expression. */
#define XFS_ERR_RETRY_FOREVER	(-1)
2016-09-14 00:51:30 +03:00
/*
* Although retry_timeout is in jiffies which is normally an unsigned long ,
* we limit the retry timeout to 86400 seconds , or one day . So even a
* signed 32 - bit long is sufficient for a HZ value up to 24855. Making it
* signed lets us store the special " -1 " value , meaning retry forever .
*/
2016-05-18 03:58:51 +03:00
struct xfs_error_cfg {
struct xfs_kobj kobj ;
int max_retries ;
2016-09-14 00:51:30 +03:00
long retry_timeout ; /* in jiffies, -1 = infinite */
2016-05-18 03:58:51 +03:00
} ;
2021-08-06 21:05:39 +03:00
/*
 * Per-cpu deferred inode inactivation GC lists.
 */
struct xfs_inodegc {
	struct llist_head	list;
	struct work_struct	work;

	/* approximate count of inodes in the list */
	unsigned int		items;

	/* NOTE(review): presumably bumped by the inodegc shrinker - confirm */
	unsigned int		shrinker_hits;
};
2020-05-20 23:17:11 +03:00
/*
* The struct xfsmount layout is optimised to separate read - mostly variables
* from variables that are frequently modified . We put the read - mostly variables
* first , then place all the other variables at the end .
*
* Typically , read - mostly variables are those that are set at mount time and
* never changed again , or only change rarely as a result of things like sysfs
* knobs being tweaked .
*/
2005-04-17 02:20:36 +04:00
typedef struct xfs_mount {
2020-05-20 23:17:11 +03:00
struct xfs_sb m_sb ; /* copy of fs superblock */
2007-08-30 11:21:30 +04:00
struct super_block * m_super ;
2008-10-30 09:38:26 +03:00
struct xfs_ail * m_ail ; /* fs active log item list */
2005-04-17 02:20:36 +04:00
struct xfs_buf * m_sb_bp ; /* buffer for superblock */
2005-11-02 03:44:33 +03:00
char * m_rtname ; /* realtime device name */
char * m_logname ; /* external log device name */
2014-06-06 09:01:58 +04:00
struct xfs_da_geometry * m_dir_geo ; /* directory block geometry */
struct xfs_da_geometry * m_attr_geo ; /* attribute block geometry */
2012-06-14 18:22:15 +04:00
struct xlog * m_log ; /* log specific stuff */
2005-04-17 02:20:36 +04:00
struct xfs_inode * m_rbmip ; /* pointer to bitmap inode */
struct xfs_inode * m_rsumip ; /* pointer to summary inode */
struct xfs_inode * m_rootip ; /* pointer to root directory */
struct xfs_quotainfo * m_quotainfo ; /* disk quota information */
xfs_buftarg_t * m_ddev_targp ; /* saves taking the address */
xfs_buftarg_t * m_logdev_targp ; /* ptr to log device */
xfs_buftarg_t * m_rtdev_targp ; /* ptr to rt device */
2021-08-06 21:05:38 +03:00
struct list_head m_mount_list ; /* global mount list */
2021-08-06 21:05:39 +03:00
void __percpu * m_inodegc ; /* percpu inodegc structures */
2020-05-20 23:17:11 +03:00
/*
* Optional cache of rt summary level per bitmap block with the
* invariant that m_rsum_cache [ bbno ] < = the minimum i for which
* rsum [ i ] [ bbno ] ! = 0. Reads and writes are serialized by the rsumip
* inode lock .
*/
uint8_t * m_rsum_cache ;
struct xfs_mru_cache * m_filestream ; /* per-mount filestream data */
struct workqueue_struct * m_buf_workqueue ;
struct workqueue_struct * m_unwritten_workqueue ;
struct workqueue_struct * m_cil_workqueue ;
struct workqueue_struct * m_reclaim_workqueue ;
struct workqueue_struct * m_sync_workqueue ;
2021-08-06 21:05:39 +03:00
struct workqueue_struct * m_blockgc_wq ;
struct workqueue_struct * m_inodegc_wq ;
2020-05-20 23:17:11 +03:00
int m_bsize ; /* fs logical block size */
2017-06-16 21:00:05 +03:00
uint8_t m_blkbit_log ; /* blocklog + NBBY */
uint8_t m_blkbb_log ; /* blocklog - BBSHIFT */
uint8_t m_agno_log ; /* log #ag's */
2020-05-20 23:17:11 +03:00
uint8_t m_sectbb_log ; /* sectlog - BBSHIFT */
2005-04-17 02:20:36 +04:00
uint m_blockmask ; /* sb_blocksize-1 */
uint m_blockwsize ; /* sb_blocksize in words */
uint m_blockwmask ; /* blockwsize-1 */
2008-10-30 09:11:19 +03:00
uint m_alloc_mxr [ 2 ] ; /* max alloc btree records */
uint m_alloc_mnr [ 2 ] ; /* min alloc btree records */
uint m_bmap_dmxr [ 2 ] ; /* max bmap btree records */
uint m_bmap_dmnr [ 2 ] ; /* min bmap btree records */
2016-08-03 04:36:07 +03:00
uint m_rmap_mxr [ 2 ] ; /* max rmap btree records */
uint m_rmap_mnr [ 2 ] ; /* min rmap btree records */
2016-10-03 19:11:18 +03:00
uint m_refc_mxr [ 2 ] ; /* max refc btree records */
uint m_refc_mnr [ 2 ] ; /* min refc btree records */
2005-04-17 02:20:36 +04:00
uint m_ag_maxlevels ; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels [ 2 ] ; /* XFS_BM_MAXLEVELS */
2016-08-03 04:36:07 +03:00
uint m_rmap_maxlevels ; /* max rmap btree levels */
2016-10-03 19:11:18 +03:00
uint m_refc_maxlevels ; /* max refcount btree level */
2016-08-03 04:31:47 +03:00
xfs_extlen_t m_ag_prealloc_blocks ; /* reserved ag blocks */
2016-08-03 04:38:24 +03:00
uint m_alloc_set_aside ; /* space we can't use */
uint m_ag_max_usable ; /* max space per AG */
2020-05-20 23:17:11 +03:00
int m_dalign ; /* stripe unit */
int m_swidth ; /* stripe width */
xfs_agnumber_t m_maxagi ; /* highest inode alloc group */
uint m_allocsize_log ; /* min write size log bytes */
uint m_allocsize_blocks ; /* min write size blocks */
int m_logbufs ; /* number of log buffers */
int m_logbsize ; /* size of each log buffer */
uint m_rsumlevels ; /* rt summary levels */
uint m_rsumsize ; /* size of rt summary, bytes */
2005-04-17 02:20:36 +04:00
int m_fixedfsid [ 2 ] ; /* unchanged for life of FS */
uint m_qflags ; /* quota status flags */
2020-05-20 23:17:11 +03:00
uint64_t m_flags ; /* global mount flags */
2021-08-06 21:05:41 +03:00
uint64_t m_low_space [ XFS_LOWSP_MAX ] ;
uint64_t m_low_rtexts [ XFS_LOWSP_MAX ] ;
2020-05-20 23:17:11 +03:00
struct xfs_ino_geometry m_ino_geo ; /* inode geometry */
2013-08-12 14:49:56 +04:00
struct xfs_trans_resv m_resv ; /* precomputed res values */
2020-05-20 23:17:11 +03:00
/* low free space thresholds */
2021-08-06 21:05:39 +03:00
unsigned long m_opstate ; /* dynamic state flags */
2020-05-20 23:17:11 +03:00
bool m_always_cow ;
bool m_fail_unmount ;
bool m_finobt_nores ; /* no per-AG finobt resv. */
bool m_update_sb ; /* sb needs update in mount */
/*
* Bitsets of per - fs metadata that have been checked and / or are sick .
* Callers must hold m_sb_lock to access these two fields .
*/
uint8_t m_fs_checked ;
uint8_t m_fs_sick ;
/*
* Bitsets of rt metadata that have been checked and / or are sick .
* Callers must hold m_sb_lock to access this field .
*/
uint8_t m_rt_checked ;
uint8_t m_rt_sick ;
/*
* End of read - mostly variables . Frequently written variables and locks
* should be placed below this comment from now on . The first variable
* here is marked as cacheline aligned so they it is separated from
* the read - mostly variables .
*/
spinlock_t ____cacheline_aligned m_sb_lock ; /* sb counter lock */
struct percpu_counter m_icount ; /* allocated inodes counter */
struct percpu_counter m_ifree ; /* free inodes counter */
struct percpu_counter m_fdblocks ; /* free block counter */
/*
* Count of data device blocks reserved for delayed allocations ,
* including indlen blocks . Does not include allocated CoW staging
* extents or anything related to the rt device .
*/
struct percpu_counter m_delalloc_blks ;
2021-04-29 01:05:50 +03:00
/*
* Global count of allocation btree blocks in use across all AGs . Only
* used when perag reservation is enabled . Helps prevent block
* reservation from attempting to reserve allocation btree blocks .
*/
atomic64_t m_allocbt_blks ;
2020-05-20 23:17:11 +03:00
struct radix_tree_root m_perag_tree ; /* per-ag accounting info */
spinlock_t m_perag_lock ; /* lock for m_perag_tree */
2017-06-16 21:00:05 +03:00
uint64_t m_resblks ; /* total reserved blocks */
uint64_t m_resblks_avail ; /* available reserved blocks */
uint64_t m_resblks_save ; /* reserved blks @ remount,ro */
2011-04-08 06:45:07 +04:00
struct delayed_work m_reclaim_work ; /* background inode reclaim */
2014-07-15 02:07:01 +04:00
struct xfs_kobj m_kobj ;
2016-05-18 03:58:51 +03:00
struct xfs_kobj m_error_kobj ;
2016-05-18 04:01:00 +03:00
struct xfs_kobj m_error_meta_kobj ;
2016-05-18 03:58:51 +03:00
struct xfs_error_cfg m_error_cfg [ XFS_ERR_CLASS_MAX ] [ XFS_ERR_ERRNO_MAX ] ;
2015-10-12 10:21:19 +03:00
struct xstats m_stats ; /* per-fs stats */
2020-05-20 23:17:11 +03:00
xfs_agnumber_t m_agfrotor ; /* last ag where space found */
xfs_agnumber_t m_agirotor ; /* last ag dir inode alloced */
spinlock_t m_agirotor_lock ; /* .. and lock protecting it */
2012-02-29 13:53:48 +04:00
2021-08-06 21:05:43 +03:00
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker m_inodegc_shrinker ;
2020-04-12 23:11:10 +03:00
/*
* Workqueue item so that we can coalesce multiple inode flush attempts
* into a single flush .
*/
struct work_struct m_flush_inodes_work ;
2015-02-16 03:49:23 +03:00
/*
* Generation of the filesysyem layout . This is incremented by each
* growfs , and used by the pNFS server to ensure the client updates
* its view of the block device once it gets a layout that might
* reference the newly added blocks . Does not need to be persistent
* as long as we only allow file system size increments , but if we
* ever support shrinks it would have to be persisted in addition
* to various other kinds of pain inflicted on the pNFS server .
*/
2017-06-16 21:00:05 +03:00
uint32_t m_generation ;
2020-05-20 23:17:11 +03:00
struct mutex m_growlock ; /* growfs mutex */
2016-03-15 03:42:44 +03:00
# ifdef DEBUG
2017-06-21 03:54:46 +03:00
/*
* Frequency with which errors are injected . Replaces xfs_etest ; the
* value stored in here is the inverse of the frequency with which the
* error triggers . 1 = always , 2 = half the time , etc .
*/
unsigned int * m_errortag ;
2017-06-21 03:54:47 +03:00
struct xfs_kobj m_errortag_kobj ;
2016-03-15 03:42:44 +03:00
# endif
2005-04-17 02:20:36 +04:00
} xfs_mount_t ;
2019-06-05 21:19:34 +03:00
/* Address of the m_ino_geo member of the mount structure that mp points to. */
#define M_IGEO(mp)		(&((mp)->m_ino_geo))
2005-04-17 02:20:36 +04:00
/*
 * Flags for m_flags.
 */
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means that when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 09:26:31 +04:00
#define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
						   must be synchronous except
						   for space allocations */
#define XFS_MOUNT_UNMOUNTING	(1ULL << 1)	/* filesystem is unmounting */
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means that when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 09:26:31 +04:00
/* Further single-bit mount flag values (bits 3 to 27; some bit numbers are unused). */
#define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
						   operations, typically for
						   disk errors in metadata */
#define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
#define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
						   allocations */
#define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
#define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
#define XFS_MOUNT_ALLOCSIZE	(1ULL << 12)	/* specified allocation size */
#define XFS_MOUNT_SMALL_INUMS	(1ULL << 14)	/* user wants 32bit inodes */
#define XFS_MOUNT_32BITINODES	(1ULL << 15)	/* inode32 allocator active */
#define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
#define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters */
#define XFS_MOUNT_SWALLOC	(1ULL << 19)	/* turn on stripe width
						 * allocation */
#define XFS_MOUNT_RDONLY	(1ULL << 20)	/* read-only fs */
#define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
#define XFS_MOUNT_LARGEIO	(1ULL << 22)	/* report large preferred
						 * I/O size in stat() */
#define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
						   allocator */
#define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
#define XFS_MOUNT_DAX_ALWAYS	(1ULL << 26)
#define XFS_MOUNT_DAX_NEVER	(1ULL << 27)
2015-06-04 02:19:18 +03:00
2021-08-06 21:05:39 +03:00
/*
 * Bit numbers within m_opstate.
 *
 * If set, inactivation worker threads will be scheduled to process queued
 * inodegc work.  If not, queued inodes remain in memory waiting to be
 * processed.
 */
#define XFS_OPSTATE_INODEGC_ENABLED	0
/*
 * If set, background speculative prealloc gc worker threads will be scheduled
 * to process queued blockgc work.  If not, inodes retain their preallocations
 * until explicitly deleted.
 */
#define XFS_OPSTATE_BLOCKGC_ENABLED	1
2021-08-06 21:05:39 +03:00
# define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ # # name ( struct xfs_mount * mp ) \
{ \
return test_bit ( XFS_OPSTATE_ # # NAME , & mp - > m_opstate ) ; \
} \
static inline bool xfs_clear_ # # name ( struct xfs_mount * mp ) \
{ \
return test_and_clear_bit ( XFS_OPSTATE_ # # NAME , & mp - > m_opstate ) ; \
} \
static inline bool xfs_set_ # # name ( struct xfs_mount * mp ) \
{ \
return test_and_set_bit ( XFS_OPSTATE_ # # NAME , & mp - > m_opstate ) ; \
}
__XFS_IS_OPSTATE ( inodegc_enabled , INODEGC_ENABLED )
2021-08-06 21:05:42 +03:00
__XFS_IS_OPSTATE ( blockgc_enabled , BLOCKGC_ENABLED )
2021-08-06 21:05:39 +03:00
/*
 * Initializer list of { bit mask, name } pairs, one per m_opstate bit.
 * NOTE(review): presumably consumed by tracing code to print m_opstate
 * symbolically - confirm against the users of this macro.
 */
#define XFS_OPSTATE_STRINGS \
	{ (1UL << XFS_OPSTATE_INODEGC_ENABLED),		"inodegc" }, \
	{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED),		"blockgc" }
2021-08-06 21:05:39 +03:00
2005-04-17 02:20:36 +04:00
/*
 * Max and min values for mount-option defined I/O
 * preallocation sizes, expressed as log2 of the size in bytes.
 */
#define XFS_MAX_IO_LOG		30	/* 1G */
#define XFS_MIN_IO_LOG		PAGE_SHIFT
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means that when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 09:26:31 +04:00
/*
 * m_flags predicates.  Each expands to the masked flag value, so the result
 * is non-zero (not necessarily 1) when the flag is set.
 */
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp)	\
				((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
#define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
2007-08-30 11:20:39 +04:00
/*
 * xfs_force_shutdown() is a wrapper around xfs_do_force_shutdown() that
 * passes the caller's __FILE__ and __LINE__ as the fname/lnnum arguments.
 */
void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
		int lnnum);
#define xfs_force_shutdown(m,f)	\
	xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
2005-04-17 02:20:36 +04:00
2008-11-28 06:23:36 +03:00
/* Shutdown reason bits; each reason occupies its own bit. */
#define SHUTDOWN_META_IO_ERROR	(1 << 0)	/* write attempt to metadata failed */
#define SHUTDOWN_LOG_IO_ERROR	(1 << 1)	/* write attempt to the log failed */
#define SHUTDOWN_FORCE_UMOUNT	(1 << 2)	/* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE	(1 << 3)	/* corrupt in-memory data structures */
2005-04-17 02:20:36 +04:00
/*
 * Flags for xfs_mountfs
 */
#define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
2005-04-17 02:20:36 +04:00
2005-11-02 06:38:42 +03:00
/*
 * Return the allocation group number for disk address @d.
 *
 * The address is converted with XFS_BB_TO_FSBT() and then divided by
 * sb_agblocks.  do_div() divides in place, so after the call ld holds the
 * quotient, which is returned as the AG number.
 */
static inline xfs_agnumber_t
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
{
	xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);

	do_div(ld, mp->m_sb.sb_agblocks);
	return (xfs_agnumber_t) ld;
}
2005-11-02 06:38:42 +03:00
/*
 * Return the block number within its allocation group for disk address @d.
 *
 * The address is converted with XFS_BB_TO_FSBT() and divided by sb_agblocks;
 * do_div() returns the remainder of that division, which is the AG-relative
 * block number.
 */
static inline xfs_agblock_t
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
{
	xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);

	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
2021-06-02 03:48:24 +03:00
/*
 * Buffer hash setup/teardown for a per-AG structure.  Only the prototypes are
 * visible here.
 */
int xfs_buf_hash_init(struct xfs_perag *pag);
void xfs_buf_hash_destroy(struct xfs_perag *pag);
2016-12-07 09:36:36 +03:00
2015-11-03 05:06:34 +03:00
extern void xfs_uuid_table_free ( void ) ;
2017-06-16 21:00:05 +03:00
extern uint64_t xfs_default_resblks ( xfs_mount_t * mp ) ;
2008-08-13 10:49:32 +04:00
extern int xfs_mountfs ( xfs_mount_t * mp ) ;
2008-08-13 10:49:57 +04:00
extern void xfs_unmountfs ( xfs_mount_t * ) ;
2015-02-23 13:24:37 +03:00
2021-08-06 21:05:40 +03:00
/*
 * Block count deltas range from 1 to very large values.  Lock contention,
 * however, only shows up with frequent small updates, such as the
 * page-at-a-time updates in the delayed allocation path for buffered writes.
 * A large batch count (1024) therefore keeps global counter updates to a
 * minimum, except close to ENOSPC where the updates have to be very accurate.
 */
#define XFS_FDBLOCKS_BATCH	1024
2015-02-23 13:22:03 +03:00
extern int xfs_mod_fdblocks ( struct xfs_mount * mp , int64_t delta ,
bool reserved ) ;
2015-02-23 13:22:54 +03:00
extern int xfs_mod_frextents ( struct xfs_mount * mp , int64_t delta ) ;
2006-03-31 07:04:17 +04:00
extern int xfs_readsb ( xfs_mount_t * , int ) ;
2005-04-17 02:20:36 +04:00
extern void xfs_freesb ( xfs_mount_t * ) ;
2014-11-28 06:02:59 +03:00
extern bool xfs_fs_writable ( struct xfs_mount * mp , int level ) ;
2017-06-16 21:00:05 +03:00
extern int xfs_sb_validate_fsb_count ( struct xfs_sb * , uint64_t ) ;
2005-04-17 02:20:36 +04:00
2010-02-17 22:36:13 +03:00
extern int xfs_dev_is_read_only ( struct xfs_mount * , char * ) ;
2011-01-04 03:35:03 +03:00
extern void xfs_set_low_space_thresholds ( struct xfs_mount * ) ;
2015-11-03 04:27:22 +03:00
int xfs_zero_extent ( struct xfs_inode * ip , xfs_fsblock_t start_fsb ,
xfs_off_t count_fsb ) ;
2016-05-18 04:05:33 +03:00
struct xfs_error_cfg * xfs_error_get_cfg ( struct xfs_mount * mp ,
int error_class , int error ) ;
2018-07-20 19:28:40 +03:00
void xfs_force_summary_recalc ( struct xfs_mount * mp ) ;
2019-04-26 04:26:22 +03:00
void xfs_mod_delalloc ( struct xfs_mount * mp , int64_t delta ) ;
2016-05-18 04:05:33 +03:00
2005-04-17 02:20:36 +04:00
# endif /* __XFS_MOUNT_H__ */