2005-04-16 15:20:36 -07:00
/*
2005-11-02 14:58:39 +11:00
* Copyright ( c ) 2000 - 2003 , 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-16 15:20:36 -07:00
*
2005-11-02 14:58:39 +11:00
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License as
2005-04-16 15:20:36 -07:00
* published by the Free Software Foundation .
*
2005-11-02 14:58:39 +11:00
* This program is distributed in the hope that it would be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
2005-04-16 15:20:36 -07:00
*
2005-11-02 14:58:39 +11:00
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write the Free Software Foundation ,
* Inc . , 51 Franklin St , Fifth Floor , Boston , MA 02110 - 1301 USA
2005-04-16 15:20:36 -07:00
*/
# ifndef __XFS_LOG_PRIV_H__
# define __XFS_LOG_PRIV_H__
/*
 * Forward declarations.  These types are only referenced through pointers
 * in this header, so their full definitions are not needed here.
 */
struct xfs_buf;
struct ktrace;
struct log;
struct xlog_ticket;
struct xfs_buf_cancel;
struct xfs_mount;
/*
* Macros , structures , prototypes for internal log manager use .
*/
/* Lower and upper limits for the iclog count. */
#define XLOG_MIN_ICLOGS		2
#define XLOG_MAX_ICLOGS		8

/* Log record magic number; chosen so that it is not a valid cycle number. */
#define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe

/* Log versions.  Version 2 adds large iclogs and a log stripe unit. */
#define XLOG_VERSION_1		1
#define XLOG_VERSION_2		2
/* Mask of every version bit this code accepts. */
#define XLOG_VERSION_OKBITS	(XLOG_VERSION_1 | XLOG_VERSION_2)
2007-08-16 16:25:33 +10:00
/*
 * Log record buffer sizes in bytes, followed by the matching log2 shifts.
 * Each XLOG_*_RECORD_BSIZE equals 1 << XLOG_*_RECORD_BSHIFT.
 */
#define XLOG_MIN_RECORD_BSIZE	(16*1024)	/* eventually 32k */
#define XLOG_BIG_RECORD_BSIZE	(32*1024)	/* 32k buffers */
#define XLOG_MAX_RECORD_BSIZE	(256*1024)
#define XLOG_HEADER_CYCLE_SIZE	(32*1024)	/* cycle data in header */
#define XLOG_MIN_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
#define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
#define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
/*
 * Convert between a count `b` and log stripe units, using the stripe unit
 * stored in the superblock (log->l_mp->m_sb.sb_logsunit).
 *
 * XLOG_BTOLSUNIT rounds up: any partial stripe unit counts as a whole one.
 * XLOG_LSUNITTOB is the plain multiplication back.
 *
 * NOTE: sb_logsunit is used as a divisor; callers must not use
 * XLOG_BTOLSUNIT when it is zero.
 */
#define XLOG_BTOLSUNIT(log, b)	(((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
				 (log)->l_mp->m_sb.sb_logsunit)
#define XLOG_LSUNITTOB(log, su)	((su) * (log)->l_mp->m_sb.sb_logsunit)

/* Size in bytes of one log record header sector. */
#define XLOG_HEADER_SIZE	512
/*
 * Size of the largest single log record for this log, passed through
 * BTOBB().  The shift used is XLOG_MAX_RECORD_BSHIFT when the superblock
 * reports a version 2 log and XLOG_BIG_RECORD_BSHIFT otherwise.
 *
 * The `log` argument is parenthesised so that any pointer expression can
 * be passed safely.
 */
#define XLOG_REC_SHIFT(log) \
	BTOBB(1 << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))

/*
 * Same as XLOG_REC_SHIFT, but scaled by XLOG_MAX_ICLOGS, i.e. the combined
 * size of the maximum number of records of the largest size.
 */
#define XLOG_TOTAL_REC_SHIFT(log) \
	BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&(log)->l_mp->m_sb) ? \
	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
2007-10-12 10:58:05 +10:00
/*
 * Build a log sequence number from its two halves: `cycle` goes into the
 * upper 32 bits and `block` into the lower 32 bits.
 *
 * The cast happens before the shift so that the shift is performed in
 * xfs_lsn_t rather than in (32-bit) uint.
 * NOTE(review): assumes xfs_lsn_t is at least 64 bits wide - confirm.
 */
static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
{
	return ((xfs_lsn_t)cycle << 32) | block;
}
2005-04-16 15:20:36 -07:00
2007-10-12 10:58:05 +10:00
static inline uint xlog_get_cycle ( char * ptr )
{
2007-10-12 10:59:34 +10:00
if ( be32_to_cpu ( * ( __be32 * ) ptr ) = = XLOG_HEADER_MAGIC_NUM )
return be32_to_cpu ( * ( ( __be32 * ) ptr + 1 ) ) ;
2007-10-12 10:58:05 +10:00
else
2007-10-12 10:59:34 +10:00
return be32_to_cpu ( * ( __be32 * ) ptr ) ;
2007-10-12 10:58:05 +10:00
}
2005-04-16 15:20:36 -07:00
/*
 * Midpoint of two block numbers (rounded down).
 *
 * Both arguments are parenthesised so that expressions with operators of
 * lower precedence than '+' (e.g. `a & b`, `x << n`) are averaged as
 * written rather than being re-associated by the expansion.
 */
#define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
# ifdef __KERNEL__
/*
* get client id from packed copy .
*
* this hack is here because the xlog_pack code copies four bytes
* of xlog_op_header containing the fields oh_clientid , oh_flags
* and oh_res2 into the packed copy .
*
* later on this four byte chunk is treated as an int and the
* client id is pulled out .
*
* this has endian issues , of course .
*/
2007-10-12 10:59:34 +10:00
/*
 * Extract the client id from a packed 4-byte copy of an op header.
 *
 * `i` is the big-endian word holding oh_clientid, oh_flags and oh_res2.
 * After conversion to CPU order the client id is the most significant
 * byte, so shifting right by 24 isolates it.
 */
static inline uint xlog_get_client_id(__be32 i)
{
	return be32_to_cpu(i) >> 24;
}
2005-04-16 15:20:36 -07:00
/*
 * Logging shorthands built on cmn_err().  The arguments are forwarded
 * unchanged after the severity level.  Note that xlog_exit() uses the same
 * CE_PANIC level as xlog_panic().
 */
#define xlog_panic(args...)	cmn_err(CE_PANIC, ## args)
#define xlog_exit(args...)	cmn_err(CE_PANIC, ## args)
#define xlog_warn(args...)	cmn_err(CE_WARN, ## args)
/*
* In core log state
*/
#define XLOG_STATE_ACTIVE	0x0001	/* current iclog being written to */
#define XLOG_STATE_WANT_SYNC	0x0002	/* sync requested; no more writes */
#define XLOG_STATE_SYNCING	0x0004	/* iclog is being synced */
#define XLOG_STATE_DONE_SYNC	0x0008	/* sync to disk has finished */
#define XLOG_STATE_DO_CALLBACK	0x0010	/* process callback functions */
#define XLOG_STATE_CALLBACK	0x0020	/* callback functions now */
#define XLOG_STATE_DIRTY	0x0040	/* dirty iclog, not ready for ACTIVE */
#define XLOG_STATE_IOERROR	0x0080	/* I/O error happened while syncing */
#define XLOG_STATE_ALL		0x7FFF	/* mask of all possible valid flags */
#define XLOG_STATE_NOTUSED	0x8000	/* this iclog is not being used */
# endif /* __KERNEL__ */
/*
* Flags to log operation header
*
* The first write of a new transaction will be preceded with a start
* record , XLOG_START_TRANS . Once a transaction is committed , a commit
* record is written , XLOG_COMMIT_TRANS . If a single region can not fit into
* the remainder of the current active in - core log , it is split up into
* multiple regions . Each partial region will be marked with a
* XLOG_CONTINUE_TRANS until the last one , which gets marked with XLOG_END_TRANS .
*
*/
#define XLOG_START_TRANS	0x01	/* start a new transaction */
#define XLOG_COMMIT_TRANS	0x02	/* commit this transaction */
#define XLOG_CONTINUE_TRANS	0x04	/* continue this trans into a new region */
/*
 * NOTE(review): the original comment for XLOG_WAS_CONT_TRANS duplicated the
 * one for XLOG_CONTINUE_TRANS; presumably this flag marks a region that
 * continues a previous one - confirm against the users of the flag.
 */
#define XLOG_WAS_CONT_TRANS	0x08
#define XLOG_END_TRANS		0x10	/* end a continued transaction */
#define XLOG_UNMOUNT_TRANS	0x20	/* unmount a filesystem transaction */
# ifdef __KERNEL__
/*
* Flags to log ticket
*/
#define XLOG_TIC_INITED		0x1	/* ticket has been initialized */
#define XLOG_TIC_PERM_RESERV	0x2	/* ticket holds a permanent reservation */
#define XLOG_TIC_IN_Q		0x4	/* presumably: ticket is on a queue - confirm */
# endif /* __KERNEL__ */
/* 0x55 0x6e are the ASCII codes for "Un", as in Unmount. */
#define XLOG_UNMOUNT_TYPE	0x556e
/*
* Flags for log structure
*/
/*
 * Bit flags for the log structure.  Each flag is a distinct bit so they
 * can be tested with a bitwise AND.
 */
#define XLOG_CHKSUM_MISMATCH	0x1	/* used only during recovery */
#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
#define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
					   shutdown */
/* Transaction identifier: a 32-bit unsigned value. */
typedef __uint32_t	xlog_tid_t;
# ifdef __KERNEL__
/*
* Below are states for covering allocation transactions .
* By covering , we mean changing the h_tail_lsn in the last on - disk
* log write such that no allocation transactions will be re - done during
* recovery after a system crash . Recovery starts at the last on - disk
* log write .
*
* These states are used to insert dummy log entries to cover
* space allocation transactions which can undo non - transactional changes
* after a crash . Writes to a file with space
* already allocated do not result in any transactions . Allocations
* might include space beyond the EOF . So if we just push the EOF a
* little , the last transaction for the file could contain the wrong
* size . If there is no file system activity , after an allocation
* transaction , and the system crashes , the allocation transaction
* will get replayed and the file will be truncated . This could
* be hours / days / . . . after the allocation occurred .
*
* The fix for this is to do two dummy transactions when the
* system is idle . We need two dummy transaction because the h_tail_lsn
* in the log record header needs to point beyond the last possible
* non - dummy transaction . The first dummy changes the h_tail_lsn to
* the first transaction before the dummy . The second dummy causes
* h_tail_lsn to point to the first dummy . Recovery starts at h_tail_lsn .
*
* These dummy transactions get committed when everything
* is idle ( after there has been some activity ) .
*
* There are 5 states used to control this .
*
* IDLE - - no logging has been done on the file system or
* we are done covering previous transactions .
* NEED - - logging has occurred and we need a dummy transaction
* when the log becomes idle .
* DONE - - we were in the NEED state and have committed a dummy
* transaction .
* NEED2 - - we detected that a dummy transaction has gone to the
* on disk log with no other transactions .
* DONE2 - - we committed a dummy transaction when in the NEED2 state .
*
* There are two places where we switch states :
*
* 1. ) In xfs_sync , when we detect an idle log and are in NEED or NEED2 .
* We commit the dummy transaction and switch to DONE or DONE2 ,
* respectively . In all other states , we don ' t do anything .
*
* 2. ) When we finish writing the on - disk log ( xlog_state_clean_log ) .
*
* No matter what state we are in , if this isn ' t the dummy
* transaction going out , the next state is NEED .
* So , if we aren ' t in the DONE or DONE2 states , the next state
* is NEED . We can ' t be finishing a write of the dummy record
* unless it was committed and the state switched to DONE or DONE2 .
*
* If we are in the DONE state and this was a write of the
* dummy transaction , we move to NEED2 .
*
* If we are in the DONE2 state and this was a write of the
* dummy transaction , we move to IDLE .
*
*
* Writing only one dummy transaction can get appended to
* one file space allocation . When this happens , the log recovery
* code replays the space allocation and a file could be truncated .
* This is why we have the NEED2 and DONE2 states before going idle .
*/
#define XLOG_STATE_COVER_IDLE	0	/* nothing to cover */
#define XLOG_STATE_COVER_NEED	1	/* first dummy transaction is needed */
#define XLOG_STATE_COVER_DONE	2	/* first dummy transaction committed */
#define XLOG_STATE_COVER_NEED2	3	/* second dummy transaction is needed */
#define XLOG_STATE_COVER_DONE2	4	/* second dummy transaction committed */

#define XLOG_COVER_OPS		5
2005-09-02 16:42:05 +10:00
/* Ticket reservation region accounting */
# define XLOG_TIC_LEN_MAX 15
/*
* Reservation region
* As would be stored in xfs_log_iovec but without the i_addr which
* we don ' t care about .
*/
typedef struct xlog_res {
2006-01-11 21:02:47 +11:00
uint r_len ; /* region length :4 */
uint r_type ; /* region's transaction type :4 */
2005-09-02 16:42:05 +10:00
} xlog_res_t ;
2005-04-16 15:20:36 -07:00
typedef struct xlog_ticket {
2008-08-13 16:34:31 +10:00
sv_t t_wait ; /* ticket wait queue : 20 */
2008-04-10 12:18:46 +10:00
struct xlog_ticket * t_next ; /* :4|8 */
2005-09-02 16:42:05 +10:00
struct xlog_ticket * t_prev ; /* :4|8 */
xlog_tid_t t_tid ; /* transaction identifier : 4 */
2008-11-17 17:37:10 +11:00
atomic_t t_ref ; /* ticket reference count : 4 */
2005-09-02 16:42:05 +10:00
int t_curr_res ; /* current reservation in bytes : 4 */
int t_unit_res ; /* unit reservation in bytes : 4 */
char t_ocnt ; /* original count : 1 */
char t_cnt ; /* current count : 1 */
char t_clientid ; /* who does this belong to; : 1 */
char t_flags ; /* properties of reservation : 1 */
uint t_trans_type ; /* transaction type : 4 */
/* reservation array fields */
uint t_res_num ; /* num in array : 4 */
uint t_res_num_ophdrs ; /* num op hdrs : 4 */
uint t_res_arr_sum ; /* array sum : 4 */
uint t_res_o_flow ; /* sum overflow : 4 */
2006-01-11 21:02:47 +11:00
xlog_res_t t_res_arr [ XLOG_TIC_LEN_MAX ] ; /* array of res : 8 * 15 */
2005-04-16 15:20:36 -07:00
} xlog_ticket_t ;
2005-09-02 16:42:05 +10:00
2005-04-16 15:20:36 -07:00
# endif
typedef struct xlog_op_header {
2007-10-12 10:58:59 +10:00
__be32 oh_tid ; /* transaction id of operation : 4 b */
__be32 oh_len ; /* bytes in data region : 4 b */
__u8 oh_clientid ; /* who sent me this : 1 b */
__u8 oh_flags ; /* : 1 b */
__u16 oh_res2 ; /* 32 bit align : 2 b */
2005-04-16 15:20:36 -07:00
} xlog_op_header_t ;
/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN	0
#define XLOG_FMT_LINUX_LE	1
#define XLOG_FMT_LINUX_BE	2
#define XLOG_FMT_IRIX_BE	3

/*
 * Format used by this build: XLOG_FMT_LINUX_BE when XFS_NATIVE_HOST is
 * defined, XLOG_FMT_LINUX_LE otherwise.
 */
#ifdef XFS_NATIVE_HOST
#define XLOG_FMT XLOG_FMT_LINUX_BE
#else
#define XLOG_FMT XLOG_FMT_LINUX_LE
#endif
typedef struct xlog_rec_header {
2007-10-12 10:59:34 +10:00
__be32 h_magicno ; /* log record (LR) identifier : 4 */
__be32 h_cycle ; /* write cycle of log : 4 */
__be32 h_version ; /* LR version : 4 */
__be32 h_len ; /* len in bytes; should be 64-bit aligned: 4 */
__be64 h_lsn ; /* lsn of this LR : 8 */
__be64 h_tail_lsn ; /* lsn of 1st LR w/ buffers not committed: 8 */
__be32 h_chksum ; /* may not be used; non-zero if used : 4 */
__be32 h_prev_block ; /* block number to previous LR : 4 */
__be32 h_num_logops ; /* number of log operations in this LR : 4 */
__be32 h_cycle_data [ XLOG_HEADER_CYCLE_SIZE / BBSIZE ] ;
2005-04-16 15:20:36 -07:00
/* new fields */
2007-10-12 10:59:34 +10:00
__be32 h_fmt ; /* format of log record : 4 */
uuid_t h_fs_uuid ; /* uuid of FS : 16 */
__be32 h_size ; /* iclog size : 4 */
2005-04-16 15:20:36 -07:00
} xlog_rec_header_t ;
typedef struct xlog_rec_ext_header {
2007-10-12 10:59:34 +10:00
__be32 xh_cycle ; /* write cycle of log : 4 */
__be32 xh_cycle_data [ XLOG_HEADER_CYCLE_SIZE / BBSIZE ] ; /* : 256 */
2005-04-16 15:20:36 -07:00
} xlog_rec_ext_header_t ;
# ifdef __KERNEL__
2008-11-28 14:23:38 +11:00
/*
* Quite misnamed , because this union lays out the actual on - disk log buffer .
*/
/*
 * One XLOG_HEADER_SIZE-byte header sector, viewable as a record header,
 * an extended record header, or raw bytes.  hic_sector ensures the union
 * is at least XLOG_HEADER_SIZE bytes.
 */
typedef union xlog_in_core2 {
	xlog_rec_header_t	hic_header;	/* primary record header view */
	xlog_rec_ext_header_t	hic_xheader;	/* extended header view */
	char			hic_sector[XLOG_HEADER_SIZE];	/* raw bytes */
} xlog_in_core_2_t;
2005-04-16 15:20:36 -07:00
/*
* - A log record header is 512 bytes . There is plenty of room to grow the
* xlog_rec_header_t into the reserved space .
* - ic_data follows , so a write to disk can start at the beginning of
* the iclog .
2008-08-13 16:34:31 +10:00
* - ic_force_wait is used to implement synchronous forcing of the iclog to disk.
2005-04-16 15:20:36 -07:00
* - ic_next is the pointer to the next iclog in the ring .
* - ic_bp is a pointer to the buffer used to write this incore log to disk .
* - ic_log is a pointer back to the global log structure .
* - ic_callback is a linked list of callback function / argument pairs to be
* called after an iclog finishes writing .
* - ic_size is the full size of the header plus data .
* - ic_offset is the current number of bytes written to in this iclog .
* - ic_refcnt is bumped when someone is writing to the log .
* - ic_state is the state of the iclog .
2008-04-10 12:18:39 +10:00
*
* Because of cacheline contention on large machines , we need to separate
* various resources onto different cachelines . To start with , make the
* structure cacheline aligned . The following fields can be contended on
* by independent processes :
*
* - ic_callback_ *
* - ic_refcnt
* - fields protected by the global l_icloglock
*
* so we need to ensure that these fields are located in separate cachelines .
* We ' ll put all the read - only and l_icloglock fields in the first cacheline ,
* and move everything else out to subsequent cachelines .
2005-04-16 15:20:36 -07:00
*/
2008-11-28 14:23:38 +11:00
typedef struct xlog_in_core {
2008-08-13 16:34:31 +10:00
sv_t ic_force_wait ;
sv_t ic_write_wait ;
2005-04-16 15:20:36 -07:00
struct xlog_in_core * ic_next ;
struct xlog_in_core * ic_prev ;
struct xfs_buf * ic_bp ;
struct log * ic_log ;
int ic_size ;
int ic_offset ;
int ic_bwritecnt ;
ushort_t ic_state ;
char * ic_datap ; /* pointer to iclog data */
2008-04-10 12:18:39 +10:00
# ifdef XFS_LOG_TRACE
struct ktrace * ic_trace ;
# endif
/* Callback structures need their own cacheline */
spinlock_t ic_callback_lock ____cacheline_aligned_in_smp ;
xfs_log_callback_t * ic_callback ;
xfs_log_callback_t * * ic_callback_tail ;
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp ;
2008-11-28 14:23:38 +11:00
xlog_in_core_2_t * ic_data ;
# define ic_header ic_data->hic_header
2005-04-16 15:20:36 -07:00
} xlog_in_core_t ;
/*
* The reservation head lsn is not made up of a cycle number and block number .
* Instead , it uses a cycle number and byte number . Logs don ' t expect to
* overflow 31 bits worth of byte offset , so using a byte number will mean
* that round off problems won ' t occur when releasing partial reservations .
*/
typedef struct log {
2008-04-10 12:18:54 +10:00
/* The following fields don't need locking */
struct xfs_mount * l_mp ; /* mount point */
2008-10-30 17:39:35 +11:00
struct xfs_ail * l_ailp ; /* AIL log is working with */
2008-04-10 12:18:54 +10:00
struct xfs_buf * l_xbuf ; /* extra buffer for log
* wrapping */
struct xfs_buftarg * l_targ ; /* buftarg of log */
uint l_flags ;
uint l_quotaoffs_flag ; /* XFS_DQ_*, for QUOTAOFFs */
struct xfs_buf_cancel * * l_buf_cancel_table ;
int l_iclog_hsize ; /* size of iclog header */
int l_iclog_heads ; /* # of iclog header sectors */
uint l_sectbb_log ; /* log2 of sector size in BBs */
uint l_sectbb_mask ; /* sector size (in BBs)
* alignment mask */
int l_iclog_size ; /* size of log in bytes */
int l_iclog_size_log ; /* log power size of log */
int l_iclog_bufs ; /* number of iclog buffers */
xfs_daddr_t l_logBBstart ; /* start block of log */
int l_logsize ; /* size of log in bytes */
int l_logBBsize ; /* size of log in BB chunks */
2005-04-16 15:20:36 -07:00
/* The following block of fields are changed while holding icloglock */
2008-05-19 16:34:27 +10:00
sv_t l_flush_wait ____cacheline_aligned_in_smp ;
/* waiting for iclog flush */
2005-04-16 15:20:36 -07:00
int l_covered_state ; /* state of "covering disk
* log entries " */
xlog_in_core_t * l_iclog ; /* head log queue */
2007-10-11 17:37:10 +10:00
spinlock_t l_icloglock ; /* grab to change iclog state */
2005-04-16 15:20:36 -07:00
xfs_lsn_t l_tail_lsn ; /* lsn of 1st LR with unflushed
* buffers */
xfs_lsn_t l_last_sync_lsn ; /* lsn of last LR on disk */
int l_curr_cycle ; /* Cycle number of log writes */
int l_prev_cycle ; /* Cycle number before last
* block increment */
int l_curr_block ; /* current logical log block */
int l_prev_block ; /* previous logical log block */
/* The following block of fields are changed while holding grant_lock */
2008-04-10 12:18:54 +10:00
spinlock_t l_grant_lock ____cacheline_aligned_in_smp ;
2005-04-16 15:20:36 -07:00
xlog_ticket_t * l_reserve_headq ;
xlog_ticket_t * l_write_headq ;
int l_grant_reserve_cycle ;
int l_grant_reserve_bytes ;
int l_grant_write_cycle ;
int l_grant_write_bytes ;
# ifdef XFS_LOG_TRACE
struct ktrace * l_grant_trace ;
# endif
2008-04-10 12:18:54 +10:00
/* The following field are used for debugging; need to hold icloglock */
# ifdef DEBUG
char * l_iclog_bak [ XLOG_MAX_ICLOGS ] ;
# endif
2005-04-16 15:20:36 -07:00
} xlog_t ;
2005-11-02 15:12:04 +11:00
/*
 * Non-zero when the XLOG_IO_ERROR bit is set in log->l_flags.  The result
 * is the masked flag value, not a normalised 0/1.
 */
#define XLOG_FORCED_SHUTDOWN(log)	((log)->l_flags & XLOG_IO_ERROR)
2005-04-16 15:20:36 -07:00
/* common routines */
extern xfs_lsn_t xlog_assign_tail_lsn ( struct xfs_mount * mp ) ;
extern int xlog_find_tail ( xlog_t * log ,
xfs_daddr_t * head_blk ,
2006-01-11 15:34:19 +11:00
xfs_daddr_t * tail_blk ) ;
extern int xlog_recover ( xlog_t * log ) ;
2008-08-13 16:49:32 +10:00
extern int xlog_recover_finish ( xlog_t * log ) ;
2005-04-16 15:20:36 -07:00
extern void xlog_pack_data ( xlog_t * log , xlog_in_core_t * iclog , int ) ;
extern void xlog_recover_process_iunlinks ( xlog_t * log ) ;
extern struct xfs_buf * xlog_get_bp ( xlog_t * , int ) ;
extern void xlog_put_bp ( struct xfs_buf * ) ;
extern int xlog_bread ( xlog_t * , xfs_daddr_t , int , struct xfs_buf * ) ;
2008-04-10 12:18:46 +10:00
extern kmem_zone_t * xfs_log_ticket_zone ;
2005-04-16 15:20:36 -07:00
/* iclog tracing */
/* Event identifiers for iclog tracing. */
#define XLOG_TRACE_GRAB_FLUSH	1
#define XLOG_TRACE_REL_FLUSH	2
#define XLOG_TRACE_SLEEP_FLUSH	3
#define XLOG_TRACE_WAKE_FLUSH	4
2006-09-28 11:04:16 +10:00
/*
* Unmount record type is used as a pseudo transaction type for the ticket .
* Its value must be outside the range of XFS_TRANS_* values.
*/
/* All bits set in an unsigned int, i.e. the largest unsigned int value. */
#define XLOG_UNMOUNT_REC_TYPE	(-1U)
2005-04-16 15:20:36 -07:00
# endif /* __KERNEL__ */
# endif /* __XFS_LOG_PRIV_H__ */