9daee913dc
superblock and improved the performance of the online resizing of file systems with bigalloc enabled. Fixed a lot of bugs, in particular for the inline data feature, potential races when creating and deleting inodes with shared extended attribute blocks, and the handling directory blocks which are corrupted. -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAmLrRpEACgkQ8vlZVpUN gaN2OAf/a2lxhZ6DvkTfWjni0BG6i2ajd11lT93N+we0wWk9f0VdNR2JUdTum0HL UtwP48km+FuqkbDrODlbSky5V+IZhd90ihLyPbPKSU/52c7d6IxNOCz2Fxq981j2 Ik6QgdegvCaUDHmluJfYYcS5Pa97HXtSb6VVi1RAvHHFbYDSChObs76ZQWBmhsSh Mo84mFGS7BDIVNVkg4PBMx4b3iFvKfE1AUdfA5dhB4GXHgDA+77GByw+RjdQ6Dh/ W0l5AVAXbK7BYSVX6Cg41WUMYOBu58Hrh/CHL1DWv3khvjgxLqM7ERAFOISVI3Ax vCXPXfjpbTFElUQuOw4m33vixaFU+A== =xTsM -----END PGP SIGNATURE----- Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 updates from Ted Ts'o: "Add new ioctls to set and get the file system UUID in the ext4 superblock and improved the performance of the online resizing of file systems with bigalloc enabled. Fixed a lot of bugs, in particular for the inline data feature, potential races when creating and deleting inodes with shared extended attribute blocks, and the handling of directory blocks which are corrupted" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (37 commits) ext4: add ioctls to get/set the ext4 superblock uuid ext4: avoid resizing to a partial cluster size ext4: reduce computation of overhead during resize jbd2: fix assertion 'jh->b_frozen_data == NULL' failure when journal aborted ext4: block range must be validated before use in ext4_mb_clear_bb() mbcache: automatically delete entries from cache on freeing mbcache: Remove mb_cache_entry_delete() ext2: avoid deleting xattr block that is being reused ext2: unindent codeblock in ext2_xattr_set() ext2: factor our freeing of xattr block reference ext4: fix race when reusing xattr blocks ext4: unindent codeblock in ext4_xattr_block_set() ext4: remove EA inode entry from mbcache on inode eviction mbcache: add functions to delete entry if unused mbcache: don't reclaim used entries ext4: make sure ext4_append() always allocates new block ext4: check if directory block is within i_size ext4: reflect mb_optimize_scan value in options file ext4: avoid remove directory when directory is corrupted ext4: aligned '*' in comments ...
932 lines
24 KiB
C
932 lines
24 KiB
C
// SPDX-License-Identifier: GPL-2.0+
|
|
/*
|
|
* linux/fs/jbd2/recovery.c
|
|
*
|
|
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
|
|
*
|
|
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
|
|
*
|
|
* Journal recovery routines for the generic filesystem journaling code;
|
|
* part of the ext2fs journaling system.
|
|
*/
|
|
|
|
#ifndef __KERNEL__
|
|
#include "jfs_user.h"
|
|
#else
|
|
#include <linux/time.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/jbd2.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/crc32.h>
|
|
#include <linux/blkdev.h>
|
|
#endif
|
|
|
|
/*
|
|
* Maintain information about the progress of the recovery job, so that
|
|
* the different passes can carry information between them.
|
|
*/
|
|
struct recovery_info
|
|
{
|
|
tid_t start_transaction;
|
|
tid_t end_transaction;
|
|
|
|
int nr_replays;
|
|
int nr_revokes;
|
|
int nr_revoke_hits;
|
|
};
|
|
|
|
static int do_one_pass(journal_t *journal,
|
|
struct recovery_info *info, enum passtype pass);
|
|
static int scan_revoke_records(journal_t *, struct buffer_head *,
|
|
tid_t, struct recovery_info *);
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
/* Release readahead buffers after use */
|
|
static void journal_brelse_array(struct buffer_head *b[], int n)
|
|
{
|
|
while (--n >= 0)
|
|
brelse (b[n]);
|
|
}
|
|
|
|
|
|
/*
|
|
* When reading from the journal, we are going through the block device
|
|
* layer directly and so there is no readahead being done for us. We
|
|
* need to implement any readahead ourselves if we want it to happen at
|
|
* all. Recovery is basically one long sequential read, so make sure we
|
|
* do the IO in reasonably large chunks.
|
|
*
|
|
* This is not so critical that we need to be enormously clever about
|
|
* the readahead size, though. 128K is a purely arbitrary, good-enough
|
|
* fixed value.
|
|
*/
|
|
|
|
#define MAXBUF 8
|
|
static int do_readahead(journal_t *journal, unsigned int start)
|
|
{
|
|
int err;
|
|
unsigned int max, nbufs, next;
|
|
unsigned long long blocknr;
|
|
struct buffer_head *bh;
|
|
|
|
struct buffer_head * bufs[MAXBUF];
|
|
|
|
/* Do up to 128K of readahead */
|
|
max = start + (128 * 1024 / journal->j_blocksize);
|
|
if (max > journal->j_total_len)
|
|
max = journal->j_total_len;
|
|
|
|
/* Do the readahead itself. We'll submit MAXBUF buffer_heads at
|
|
* a time to the block device IO layer. */
|
|
|
|
nbufs = 0;
|
|
|
|
for (next = start; next < max; next++) {
|
|
err = jbd2_journal_bmap(journal, next, &blocknr);
|
|
|
|
if (err) {
|
|
printk(KERN_ERR "JBD2: bad block at offset %u\n",
|
|
next);
|
|
goto failed;
|
|
}
|
|
|
|
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
|
|
if (!bh) {
|
|
err = -ENOMEM;
|
|
goto failed;
|
|
}
|
|
|
|
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
|
|
bufs[nbufs++] = bh;
|
|
if (nbufs == MAXBUF) {
|
|
ll_rw_block(REQ_OP_READ, nbufs, bufs);
|
|
journal_brelse_array(bufs, nbufs);
|
|
nbufs = 0;
|
|
}
|
|
} else
|
|
brelse(bh);
|
|
}
|
|
|
|
if (nbufs)
|
|
ll_rw_block(REQ_OP_READ, nbufs, bufs);
|
|
err = 0;
|
|
|
|
failed:
|
|
if (nbufs)
|
|
journal_brelse_array(bufs, nbufs);
|
|
return err;
|
|
}
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
|
|
/*
|
|
* Read a block from the journal
|
|
*/
|
|
|
|
static int jread(struct buffer_head **bhp, journal_t *journal,
|
|
unsigned int offset)
|
|
{
|
|
int err;
|
|
unsigned long long blocknr;
|
|
struct buffer_head *bh;
|
|
|
|
*bhp = NULL;
|
|
|
|
if (offset >= journal->j_total_len) {
|
|
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
|
|
return -EFSCORRUPTED;
|
|
}
|
|
|
|
err = jbd2_journal_bmap(journal, offset, &blocknr);
|
|
|
|
if (err) {
|
|
printk(KERN_ERR "JBD2: bad block at offset %u\n",
|
|
offset);
|
|
return err;
|
|
}
|
|
|
|
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
|
|
if (!bh)
|
|
return -ENOMEM;
|
|
|
|
if (!buffer_uptodate(bh)) {
|
|
/* If this is a brand new buffer, start readahead.
|
|
Otherwise, we assume we are already reading it. */
|
|
if (!buffer_req(bh))
|
|
do_readahead(journal, offset);
|
|
wait_on_buffer(bh);
|
|
}
|
|
|
|
if (!buffer_uptodate(bh)) {
|
|
printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
|
|
offset);
|
|
brelse(bh);
|
|
return -EIO;
|
|
}
|
|
|
|
*bhp = bh;
|
|
return 0;
|
|
}
|
|
|
|
static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
|
|
{
|
|
struct jbd2_journal_block_tail *tail;
|
|
__be32 provided;
|
|
__u32 calculated;
|
|
|
|
if (!jbd2_journal_has_csum_v2or3(j))
|
|
return 1;
|
|
|
|
tail = (struct jbd2_journal_block_tail *)((char *)buf +
|
|
j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
|
|
provided = tail->t_checksum;
|
|
tail->t_checksum = 0;
|
|
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
|
|
tail->t_checksum = provided;
|
|
|
|
return provided == cpu_to_be32(calculated);
|
|
}
|
|
|
|
/*
|
|
* Count the number of in-use tags in a journal descriptor block.
|
|
*/
|
|
|
|
static int count_tags(journal_t *journal, struct buffer_head *bh)
|
|
{
|
|
char * tagp;
|
|
journal_block_tag_t tag;
|
|
int nr = 0, size = journal->j_blocksize;
|
|
int tag_bytes = journal_tag_bytes(journal);
|
|
|
|
if (jbd2_journal_has_csum_v2or3(journal))
|
|
size -= sizeof(struct jbd2_journal_block_tail);
|
|
|
|
tagp = &bh->b_data[sizeof(journal_header_t)];
|
|
|
|
while ((tagp - bh->b_data + tag_bytes) <= size) {
|
|
memcpy(&tag, tagp, sizeof(tag));
|
|
|
|
nr++;
|
|
tagp += tag_bytes;
|
|
if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
|
|
tagp += 16;
|
|
|
|
if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
|
|
break;
|
|
}
|
|
|
|
return nr;
|
|
}
|
|
|
|
|
|
/* Make sure we wrap around the log correctly! */
|
|
#define wrap(journal, var) \
|
|
do { \
|
|
unsigned long _wrap_last = \
|
|
jbd2_has_feature_fast_commit(journal) ? \
|
|
(journal)->j_fc_last : (journal)->j_last; \
|
|
\
|
|
if (var >= _wrap_last) \
|
|
var -= (_wrap_last - (journal)->j_first); \
|
|
} while (0)
|
|
|
|
static int fc_do_one_pass(journal_t *journal,
|
|
struct recovery_info *info, enum passtype pass)
|
|
{
|
|
unsigned int expected_commit_id = info->end_transaction;
|
|
unsigned long next_fc_block;
|
|
struct buffer_head *bh;
|
|
int err = 0;
|
|
|
|
next_fc_block = journal->j_fc_first;
|
|
if (!journal->j_fc_replay_callback)
|
|
return 0;
|
|
|
|
while (next_fc_block <= journal->j_fc_last) {
|
|
jbd2_debug(3, "Fast commit replay: next block %ld\n",
|
|
next_fc_block);
|
|
err = jread(&bh, journal, next_fc_block);
|
|
if (err) {
|
|
jbd2_debug(3, "Fast commit replay: read error\n");
|
|
break;
|
|
}
|
|
|
|
err = journal->j_fc_replay_callback(journal, bh, pass,
|
|
next_fc_block - journal->j_fc_first,
|
|
expected_commit_id);
|
|
next_fc_block++;
|
|
if (err < 0 || err == JBD2_FC_REPLAY_STOP)
|
|
break;
|
|
err = 0;
|
|
}
|
|
|
|
if (err)
|
|
jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
|
|
|
|
return err;
|
|
}
|
|
|
|
/**
|
|
* jbd2_journal_recover - recovers a on-disk journal
|
|
* @journal: the journal to recover
|
|
*
|
|
* The primary function for recovering the log contents when mounting a
|
|
* journaled device.
|
|
*
|
|
* Recovery is done in three passes. In the first pass, we look for the
|
|
* end of the log. In the second, we assemble the list of revoke
|
|
* blocks. In the third and final pass, we replay any un-revoked blocks
|
|
* in the log.
|
|
*/
|
|
int jbd2_journal_recover(journal_t *journal)
|
|
{
|
|
int err, err2;
|
|
journal_superblock_t * sb;
|
|
|
|
struct recovery_info info;
|
|
|
|
memset(&info, 0, sizeof(info));
|
|
sb = journal->j_superblock;
|
|
|
|
/*
|
|
* The journal superblock's s_start field (the current log head)
|
|
* is always zero if, and only if, the journal was cleanly
|
|
* unmounted.
|
|
*/
|
|
|
|
if (!sb->s_start) {
|
|
jbd2_debug(1, "No recovery required, last transaction %d\n",
|
|
be32_to_cpu(sb->s_sequence));
|
|
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
|
|
return 0;
|
|
}
|
|
|
|
err = do_one_pass(journal, &info, PASS_SCAN);
|
|
if (!err)
|
|
err = do_one_pass(journal, &info, PASS_REVOKE);
|
|
if (!err)
|
|
err = do_one_pass(journal, &info, PASS_REPLAY);
|
|
|
|
jbd2_debug(1, "JBD2: recovery, exit status %d, "
|
|
"recovered transactions %u to %u\n",
|
|
err, info.start_transaction, info.end_transaction);
|
|
jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
|
|
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
|
|
|
|
/* Restart the log at the next transaction ID, thus invalidating
|
|
* any existing commit records in the log. */
|
|
journal->j_transaction_sequence = ++info.end_transaction;
|
|
|
|
jbd2_journal_clear_revoke(journal);
|
|
err2 = sync_blockdev(journal->j_fs_dev);
|
|
if (!err)
|
|
err = err2;
|
|
/* Make sure all replayed data is on permanent storage */
|
|
if (journal->j_flags & JBD2_BARRIER) {
|
|
err2 = blkdev_issue_flush(journal->j_fs_dev);
|
|
if (!err)
|
|
err = err2;
|
|
}
|
|
return err;
|
|
}
|
|
|
|
/**
|
|
* jbd2_journal_skip_recovery - Start journal and wipe exiting records
|
|
* @journal: journal to startup
|
|
*
|
|
* Locate any valid recovery information from the journal and set up the
|
|
* journal structures in memory to ignore it (presumably because the
|
|
* caller has evidence that it is out of date).
|
|
* This function doesn't appear to be exported..
|
|
*
|
|
* We perform one pass over the journal to allow us to tell the user how
|
|
* much recovery information is being erased, and to let us initialise
|
|
* the journal transaction sequence numbers to the next unused ID.
|
|
*/
|
|
int jbd2_journal_skip_recovery(journal_t *journal)
|
|
{
|
|
int err;
|
|
|
|
struct recovery_info info;
|
|
|
|
memset (&info, 0, sizeof(info));
|
|
|
|
err = do_one_pass(journal, &info, PASS_SCAN);
|
|
|
|
if (err) {
|
|
printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
|
|
++journal->j_transaction_sequence;
|
|
} else {
|
|
#ifdef CONFIG_JBD2_DEBUG
|
|
int dropped = info.end_transaction -
|
|
be32_to_cpu(journal->j_superblock->s_sequence);
|
|
jbd2_debug(1,
|
|
"JBD2: ignoring %d transaction%s from the journal.\n",
|
|
dropped, (dropped == 1) ? "" : "s");
|
|
#endif
|
|
journal->j_transaction_sequence = ++info.end_transaction;
|
|
}
|
|
|
|
journal->j_tail = 0;
|
|
return err;
|
|
}
|
|
|
|
static inline unsigned long long read_tag_block(journal_t *journal,
|
|
journal_block_tag_t *tag)
|
|
{
|
|
unsigned long long block = be32_to_cpu(tag->t_blocknr);
|
|
if (jbd2_has_feature_64bit(journal))
|
|
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
|
|
return block;
|
|
}
|
|
|
|
/*
|
|
* calc_chksums calculates the checksums for the blocks described in the
|
|
* descriptor block.
|
|
*/
|
|
static int calc_chksums(journal_t *journal, struct buffer_head *bh,
|
|
unsigned long *next_log_block, __u32 *crc32_sum)
|
|
{
|
|
int i, num_blks, err;
|
|
unsigned long io_block;
|
|
struct buffer_head *obh;
|
|
|
|
num_blks = count_tags(journal, bh);
|
|
/* Calculate checksum of the descriptor block. */
|
|
*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
|
|
|
|
for (i = 0; i < num_blks; i++) {
|
|
io_block = (*next_log_block)++;
|
|
wrap(journal, *next_log_block);
|
|
err = jread(&obh, journal, io_block);
|
|
if (err) {
|
|
printk(KERN_ERR "JBD2: IO error %d recovering block "
|
|
"%lu in log\n", err, io_block);
|
|
return 1;
|
|
} else {
|
|
*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
|
|
obh->b_size);
|
|
}
|
|
put_bh(obh);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
|
|
{
|
|
struct commit_header *h;
|
|
__be32 provided;
|
|
__u32 calculated;
|
|
|
|
if (!jbd2_journal_has_csum_v2or3(j))
|
|
return 1;
|
|
|
|
h = buf;
|
|
provided = h->h_chksum[0];
|
|
h->h_chksum[0] = 0;
|
|
calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
|
|
h->h_chksum[0] = provided;
|
|
|
|
return provided == cpu_to_be32(calculated);
|
|
}
|
|
|
|
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
|
|
journal_block_tag3_t *tag3,
|
|
void *buf, __u32 sequence)
|
|
{
|
|
__u32 csum32;
|
|
__be32 seq;
|
|
|
|
if (!jbd2_journal_has_csum_v2or3(j))
|
|
return 1;
|
|
|
|
seq = cpu_to_be32(sequence);
|
|
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
|
|
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
|
|
|
|
if (jbd2_has_feature_csum3(j))
|
|
return tag3->t_checksum == cpu_to_be32(csum32);
|
|
else
|
|
return tag->t_checksum == cpu_to_be16(csum32);
|
|
}
|
|
|
|
static int do_one_pass(journal_t *journal,
|
|
struct recovery_info *info, enum passtype pass)
|
|
{
|
|
unsigned int first_commit_ID, next_commit_ID;
|
|
unsigned long next_log_block;
|
|
int err, success = 0;
|
|
journal_superblock_t * sb;
|
|
journal_header_t * tmp;
|
|
struct buffer_head * bh;
|
|
unsigned int sequence;
|
|
int blocktype;
|
|
int tag_bytes = journal_tag_bytes(journal);
|
|
__u32 crc32_sum = ~0; /* Transactional Checksums */
|
|
int descr_csum_size = 0;
|
|
int block_error = 0;
|
|
bool need_check_commit_time = false;
|
|
__u64 last_trans_commit_time = 0, commit_time;
|
|
|
|
/*
|
|
* First thing is to establish what we expect to find in the log
|
|
* (in terms of transaction IDs), and where (in terms of log
|
|
* block offsets): query the superblock.
|
|
*/
|
|
|
|
sb = journal->j_superblock;
|
|
next_commit_ID = be32_to_cpu(sb->s_sequence);
|
|
next_log_block = be32_to_cpu(sb->s_start);
|
|
|
|
first_commit_ID = next_commit_ID;
|
|
if (pass == PASS_SCAN)
|
|
info->start_transaction = first_commit_ID;
|
|
|
|
jbd2_debug(1, "Starting recovery pass %d\n", pass);
|
|
|
|
/*
|
|
* Now we walk through the log, transaction by transaction,
|
|
* making sure that each transaction has a commit block in the
|
|
* expected place. Each complete transaction gets replayed back
|
|
* into the main filesystem.
|
|
*/
|
|
|
|
while (1) {
|
|
int flags;
|
|
char * tagp;
|
|
journal_block_tag_t tag;
|
|
struct buffer_head * obh;
|
|
struct buffer_head * nbh;
|
|
|
|
cond_resched();
|
|
|
|
/* If we already know where to stop the log traversal,
|
|
* check right now that we haven't gone past the end of
|
|
* the log. */
|
|
|
|
if (pass != PASS_SCAN)
|
|
if (tid_geq(next_commit_ID, info->end_transaction))
|
|
break;
|
|
|
|
jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
|
|
next_commit_ID, next_log_block,
|
|
jbd2_has_feature_fast_commit(journal) ?
|
|
journal->j_fc_last : journal->j_last);
|
|
|
|
/* Skip over each chunk of the transaction looking
|
|
* either the next descriptor block or the final commit
|
|
* record. */
|
|
|
|
jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
|
|
err = jread(&bh, journal, next_log_block);
|
|
if (err)
|
|
goto failed;
|
|
|
|
next_log_block++;
|
|
wrap(journal, next_log_block);
|
|
|
|
/* What kind of buffer is it?
|
|
*
|
|
* If it is a descriptor block, check that it has the
|
|
* expected sequence number. Otherwise, we're all done
|
|
* here. */
|
|
|
|
tmp = (journal_header_t *)bh->b_data;
|
|
|
|
if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
|
|
brelse(bh);
|
|
break;
|
|
}
|
|
|
|
blocktype = be32_to_cpu(tmp->h_blocktype);
|
|
sequence = be32_to_cpu(tmp->h_sequence);
|
|
jbd2_debug(3, "Found magic %d, sequence %d\n",
|
|
blocktype, sequence);
|
|
|
|
if (sequence != next_commit_ID) {
|
|
brelse(bh);
|
|
break;
|
|
}
|
|
|
|
/* OK, we have a valid descriptor block which matches
|
|
* all of the sequence number checks. What are we going
|
|
* to do with it? That depends on the pass... */
|
|
|
|
switch(blocktype) {
|
|
case JBD2_DESCRIPTOR_BLOCK:
|
|
/* Verify checksum first */
|
|
if (jbd2_journal_has_csum_v2or3(journal))
|
|
descr_csum_size =
|
|
sizeof(struct jbd2_journal_block_tail);
|
|
if (descr_csum_size > 0 &&
|
|
!jbd2_descriptor_block_csum_verify(journal,
|
|
bh->b_data)) {
|
|
/*
|
|
* PASS_SCAN can see stale blocks due to lazy
|
|
* journal init. Don't error out on those yet.
|
|
*/
|
|
if (pass != PASS_SCAN) {
|
|
pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
|
|
next_log_block);
|
|
err = -EFSBADCRC;
|
|
brelse(bh);
|
|
goto failed;
|
|
}
|
|
need_check_commit_time = true;
|
|
jbd2_debug(1,
|
|
"invalid descriptor block found in %lu\n",
|
|
next_log_block);
|
|
}
|
|
|
|
/* If it is a valid descriptor block, replay it
|
|
* in pass REPLAY; if journal_checksums enabled, then
|
|
* calculate checksums in PASS_SCAN, otherwise,
|
|
* just skip over the blocks it describes. */
|
|
if (pass != PASS_REPLAY) {
|
|
if (pass == PASS_SCAN &&
|
|
jbd2_has_feature_checksum(journal) &&
|
|
!need_check_commit_time &&
|
|
!info->end_transaction) {
|
|
if (calc_chksums(journal, bh,
|
|
&next_log_block,
|
|
&crc32_sum)) {
|
|
put_bh(bh);
|
|
break;
|
|
}
|
|
put_bh(bh);
|
|
continue;
|
|
}
|
|
next_log_block += count_tags(journal, bh);
|
|
wrap(journal, next_log_block);
|
|
put_bh(bh);
|
|
continue;
|
|
}
|
|
|
|
/* A descriptor block: we can now write all of
|
|
* the data blocks. Yay, useful work is finally
|
|
* getting done here! */
|
|
|
|
tagp = &bh->b_data[sizeof(journal_header_t)];
|
|
while ((tagp - bh->b_data + tag_bytes)
|
|
<= journal->j_blocksize - descr_csum_size) {
|
|
unsigned long io_block;
|
|
|
|
memcpy(&tag, tagp, sizeof(tag));
|
|
flags = be16_to_cpu(tag.t_flags);
|
|
|
|
io_block = next_log_block++;
|
|
wrap(journal, next_log_block);
|
|
err = jread(&obh, journal, io_block);
|
|
if (err) {
|
|
/* Recover what we can, but
|
|
* report failure at the end. */
|
|
success = err;
|
|
printk(KERN_ERR
|
|
"JBD2: IO error %d recovering "
|
|
"block %ld in log\n",
|
|
err, io_block);
|
|
} else {
|
|
unsigned long long blocknr;
|
|
|
|
J_ASSERT(obh != NULL);
|
|
blocknr = read_tag_block(journal,
|
|
&tag);
|
|
|
|
/* If the block has been
|
|
* revoked, then we're all done
|
|
* here. */
|
|
if (jbd2_journal_test_revoke
|
|
(journal, blocknr,
|
|
next_commit_ID)) {
|
|
brelse(obh);
|
|
++info->nr_revoke_hits;
|
|
goto skip_write;
|
|
}
|
|
|
|
/* Look for block corruption */
|
|
if (!jbd2_block_tag_csum_verify(
|
|
journal, &tag, (journal_block_tag3_t *)tagp,
|
|
obh->b_data, be32_to_cpu(tmp->h_sequence))) {
|
|
brelse(obh);
|
|
success = -EFSBADCRC;
|
|
printk(KERN_ERR "JBD2: Invalid "
|
|
"checksum recovering "
|
|
"data block %llu in "
|
|
"log\n", blocknr);
|
|
block_error = 1;
|
|
goto skip_write;
|
|
}
|
|
|
|
/* Find a buffer for the new
|
|
* data being restored */
|
|
nbh = __getblk(journal->j_fs_dev,
|
|
blocknr,
|
|
journal->j_blocksize);
|
|
if (nbh == NULL) {
|
|
printk(KERN_ERR
|
|
"JBD2: Out of memory "
|
|
"during recovery.\n");
|
|
err = -ENOMEM;
|
|
brelse(bh);
|
|
brelse(obh);
|
|
goto failed;
|
|
}
|
|
|
|
lock_buffer(nbh);
|
|
memcpy(nbh->b_data, obh->b_data,
|
|
journal->j_blocksize);
|
|
if (flags & JBD2_FLAG_ESCAPE) {
|
|
*((__be32 *)nbh->b_data) =
|
|
cpu_to_be32(JBD2_MAGIC_NUMBER);
|
|
}
|
|
|
|
BUFFER_TRACE(nbh, "marking dirty");
|
|
set_buffer_uptodate(nbh);
|
|
mark_buffer_dirty(nbh);
|
|
BUFFER_TRACE(nbh, "marking uptodate");
|
|
++info->nr_replays;
|
|
/* ll_rw_block(WRITE, 1, &nbh); */
|
|
unlock_buffer(nbh);
|
|
brelse(obh);
|
|
brelse(nbh);
|
|
}
|
|
|
|
skip_write:
|
|
tagp += tag_bytes;
|
|
if (!(flags & JBD2_FLAG_SAME_UUID))
|
|
tagp += 16;
|
|
|
|
if (flags & JBD2_FLAG_LAST_TAG)
|
|
break;
|
|
}
|
|
|
|
brelse(bh);
|
|
continue;
|
|
|
|
case JBD2_COMMIT_BLOCK:
|
|
/* How to differentiate between interrupted commit
|
|
* and journal corruption ?
|
|
*
|
|
* {nth transaction}
|
|
* Checksum Verification Failed
|
|
* |
|
|
* ____________________
|
|
* | |
|
|
* async_commit sync_commit
|
|
* | |
|
|
* | GO TO NEXT "Journal Corruption"
|
|
* | TRANSACTION
|
|
* |
|
|
* {(n+1)th transanction}
|
|
* |
|
|
* _______|______________
|
|
* | |
|
|
* Commit block found Commit block not found
|
|
* | |
|
|
* "Journal Corruption" |
|
|
* _____________|_________
|
|
* | |
|
|
* nth trans corrupt OR nth trans
|
|
* and (n+1)th interrupted interrupted
|
|
* before commit block
|
|
* could reach the disk.
|
|
* (Cannot find the difference in above
|
|
* mentioned conditions. Hence assume
|
|
* "Interrupted Commit".)
|
|
*/
|
|
commit_time = be64_to_cpu(
|
|
((struct commit_header *)bh->b_data)->h_commit_sec);
|
|
/*
|
|
* If need_check_commit_time is set, it means we are in
|
|
* PASS_SCAN and csum verify failed before. If
|
|
* commit_time is increasing, it's the same journal,
|
|
* otherwise it is stale journal block, just end this
|
|
* recovery.
|
|
*/
|
|
if (need_check_commit_time) {
|
|
if (commit_time >= last_trans_commit_time) {
|
|
pr_err("JBD2: Invalid checksum found in transaction %u\n",
|
|
next_commit_ID);
|
|
err = -EFSBADCRC;
|
|
brelse(bh);
|
|
goto failed;
|
|
}
|
|
ignore_crc_mismatch:
|
|
/*
|
|
* It likely does not belong to same journal,
|
|
* just end this recovery with success.
|
|
*/
|
|
jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
|
|
next_commit_ID);
|
|
brelse(bh);
|
|
goto done;
|
|
}
|
|
|
|
/*
|
|
* Found an expected commit block: if checksums
|
|
* are present, verify them in PASS_SCAN; else not
|
|
* much to do other than move on to the next sequence
|
|
* number.
|
|
*/
|
|
if (pass == PASS_SCAN &&
|
|
jbd2_has_feature_checksum(journal)) {
|
|
struct commit_header *cbh =
|
|
(struct commit_header *)bh->b_data;
|
|
unsigned found_chksum =
|
|
be32_to_cpu(cbh->h_chksum[0]);
|
|
|
|
if (info->end_transaction) {
|
|
journal->j_failed_commit =
|
|
info->end_transaction;
|
|
brelse(bh);
|
|
break;
|
|
}
|
|
|
|
/* Neither checksum match nor unused? */
|
|
if (!((crc32_sum == found_chksum &&
|
|
cbh->h_chksum_type ==
|
|
JBD2_CRC32_CHKSUM &&
|
|
cbh->h_chksum_size ==
|
|
JBD2_CRC32_CHKSUM_SIZE) ||
|
|
(cbh->h_chksum_type == 0 &&
|
|
cbh->h_chksum_size == 0 &&
|
|
found_chksum == 0)))
|
|
goto chksum_error;
|
|
|
|
crc32_sum = ~0;
|
|
}
|
|
if (pass == PASS_SCAN &&
|
|
!jbd2_commit_block_csum_verify(journal,
|
|
bh->b_data)) {
|
|
chksum_error:
|
|
if (commit_time < last_trans_commit_time)
|
|
goto ignore_crc_mismatch;
|
|
info->end_transaction = next_commit_ID;
|
|
|
|
if (!jbd2_has_feature_async_commit(journal)) {
|
|
journal->j_failed_commit =
|
|
next_commit_ID;
|
|
brelse(bh);
|
|
break;
|
|
}
|
|
}
|
|
if (pass == PASS_SCAN)
|
|
last_trans_commit_time = commit_time;
|
|
brelse(bh);
|
|
next_commit_ID++;
|
|
continue;
|
|
|
|
case JBD2_REVOKE_BLOCK:
|
|
/*
|
|
* Check revoke block crc in pass_scan, if csum verify
|
|
* failed, check commit block time later.
|
|
*/
|
|
if (pass == PASS_SCAN &&
|
|
!jbd2_descriptor_block_csum_verify(journal,
|
|
bh->b_data)) {
|
|
jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
|
|
next_log_block);
|
|
need_check_commit_time = true;
|
|
}
|
|
/* If we aren't in the REVOKE pass, then we can
|
|
* just skip over this block. */
|
|
if (pass != PASS_REVOKE) {
|
|
brelse(bh);
|
|
continue;
|
|
}
|
|
|
|
err = scan_revoke_records(journal, bh,
|
|
next_commit_ID, info);
|
|
brelse(bh);
|
|
if (err)
|
|
goto failed;
|
|
continue;
|
|
|
|
default:
|
|
jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
|
|
blocktype);
|
|
brelse(bh);
|
|
goto done;
|
|
}
|
|
}
|
|
|
|
done:
|
|
/*
|
|
* We broke out of the log scan loop: either we came to the
|
|
* known end of the log or we found an unexpected block in the
|
|
* log. If the latter happened, then we know that the "current"
|
|
* transaction marks the end of the valid log.
|
|
*/
|
|
|
|
if (pass == PASS_SCAN) {
|
|
if (!info->end_transaction)
|
|
info->end_transaction = next_commit_ID;
|
|
} else {
|
|
/* It's really bad news if different passes end up at
|
|
* different places (but possible due to IO errors). */
|
|
if (info->end_transaction != next_commit_ID) {
|
|
printk(KERN_ERR "JBD2: recovery pass %d ended at "
|
|
"transaction %u, expected %u\n",
|
|
pass, next_commit_ID, info->end_transaction);
|
|
if (!success)
|
|
success = -EIO;
|
|
}
|
|
}
|
|
|
|
if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) {
|
|
err = fc_do_one_pass(journal, info, pass);
|
|
if (err)
|
|
success = err;
|
|
}
|
|
|
|
if (block_error && success == 0)
|
|
success = -EIO;
|
|
return success;
|
|
|
|
failed:
|
|
return err;
|
|
}
|
|
|
|
/* Scan a revoke record, marking all blocks mentioned as revoked. */
|
|
|
|
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
|
|
tid_t sequence, struct recovery_info *info)
|
|
{
|
|
jbd2_journal_revoke_header_t *header;
|
|
int offset, max;
|
|
unsigned csum_size = 0;
|
|
__u32 rcount;
|
|
int record_len = 4;
|
|
|
|
header = (jbd2_journal_revoke_header_t *) bh->b_data;
|
|
offset = sizeof(jbd2_journal_revoke_header_t);
|
|
rcount = be32_to_cpu(header->r_count);
|
|
|
|
if (jbd2_journal_has_csum_v2or3(journal))
|
|
csum_size = sizeof(struct jbd2_journal_block_tail);
|
|
if (rcount > journal->j_blocksize - csum_size)
|
|
return -EINVAL;
|
|
max = rcount;
|
|
|
|
if (jbd2_has_feature_64bit(journal))
|
|
record_len = 8;
|
|
|
|
while (offset + record_len <= max) {
|
|
unsigned long long blocknr;
|
|
int err;
|
|
|
|
if (record_len == 4)
|
|
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
|
|
else
|
|
blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
|
|
offset += record_len;
|
|
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
|
|
if (err)
|
|
return err;
|
|
++info->nr_revokes;
|
|
}
|
|
return 0;
|
|
}
|