From d8cdda3efb9331bedbcca2343591eab2316f4cae Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Jan 2011 20:27:45 +0200 Subject: [PATCH 01/25] UBIFS: re-arrange variables in ubifs_info This is a cosmetic patch which re-arranges variables in 'struct ubifs_info' so that all boolean-like variables which are only changed during mounting or re-mounting to R/W mode are places together. Then they are turned into bit-fields, which makes the structure a little bit smaller. Signed-off-by: Artem Bityutskiy --- fs/ubifs/ubifs.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 381d6b207a52..d1efa37d80ae 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1166,22 +1166,22 @@ struct ubifs_debug_info; * @rp_uid: reserved pool user ID * @rp_gid: reserved pool group ID * - * @empty: if the UBI device is empty + * @empty: %1 if the UBI device is empty + * @need_recovery: %1 if the file-system needs recovery + * @replaying: %1 during journal replay + * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode + * @always_chk_crc: always check CRCs (while mounting and remounting to R/W + * mode) * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) * @replay_sqnum: sequence number of node currently being replayed - * @need_recovery: file-system needs recovery - * @replaying: set to %1 during journal replay * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W * mode * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted * FS to R/W mode * @size_tree: inode size information for recovery - * @remounting_rw: set while re-mounting from R/O mode to R/W mode - * @always_chk_crc: always check CRCs (while mounting and remounting to R/W - * mode) * @mount_opts: UBIFS-specific mount options * * @dbg: debugging-related information @@ -1402,19 +1402,19 @@ struct ubifs_info { gid_t rp_gid; /* The below fields are used only during mounting and re-mounting */ - int empty; + unsigned int empty:1; + unsigned int need_recovery:1; + unsigned int replaying:1; + unsigned int remounting_rw:1; + unsigned int always_chk_crc:1; struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; unsigned long long replay_sqnum; - int need_recovery; - int replaying; struct list_head unclean_leb_list; struct ubifs_mst_node *rcvrd_mst_node; struct rb_root size_tree; - int remounting_rw; - int always_chk_crc; struct ubifs_mount_opts mount_opts; #ifdef CONFIG_UBIFS_FS_DEBUG From 18d1d7fbcc260e67d249bf90b454d8cf34288453 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Jan 2011 22:27:56 +0200 Subject: [PATCH 02/25] UBIFS: introduce mounting flag This is a preparational patch which removes the 'c->always_chk_crc' which was set during mounting and remounting to R/W mode and introduces 'c->mounting' flag which is set when mounting. Now the 'c->always_chk_crc' flag is the same as 'c->remounting_rw && c->mounting'. This patch is a preparation for the next one which will need to know when we are mounting and remounting to R/W mode, which is exactly what 'c->always_chk_crc' effectively is, but its name does not suite the next patch. The other possibility would be to just re-name it, but then we'd end up with less logical flags coverage. Signed-off-by: Artem Bityutskiy --- fs/ubifs/io.c | 12 ++++++++---- fs/ubifs/super.c | 11 ++--------- fs/ubifs/tnc.c | 10 +++++++--- fs/ubifs/ubifs.h | 5 ++--- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index d82173182eeb..d1fe56203a1d 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -88,8 +88,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) * This function may skip data nodes CRC checking if @c->no_chk_data_crc is * true, which is controlled by corresponding UBIFS mount option. However, if * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is - * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is - * ignored and CRC is checked. + * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are + * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC + * is checked. This is because during mounting or re-mounting from R/O mode to + * R/W mode we may read journal nodes (when replying the journal or doing the + * recovery) and the journal nodes may potentially be corrupted, so checking is + * required. * * This function returns zero in case of success and %-EUCLEAN in case of bad * CRC or magic. @@ -131,8 +135,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_len > c->ranges[type].max_len) goto out_len; - if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && - c->no_chk_data_crc) + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting && + !c->remounting_rw && c->no_chk_data_crc) return 0; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 91fac54c70e3..703a62109cf2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1194,11 +1194,7 @@ static int mount_ubifs(struct ubifs_info *c) if (c->bulk_read == 1) bu_init(c); - /* - * We have to check all CRCs, even for data nodes, when we mount the FS - * (specifically, when we are replaying). - */ - c->always_chk_crc = 1; + c->mounting = 1; err = ubifs_read_superblock(c); if (err) @@ -1374,7 +1370,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - c->always_chk_crc = 0; + c->mounting = 0; ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", c->vi.ubi_num, c->vi.vol_id, c->vi.name); @@ -1535,7 +1531,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; - c->always_chk_crc = 1; err = check_free_space(c); if (err) @@ -1642,7 +1637,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) dbg_gen("re-mounted read-write"); c->ro_mount = 0; c->remounting_rw = 0; - c->always_chk_crc = 0; err = dbg_check_space_info(c); mutex_unlock(&c->umount_mutex); return err; @@ -1659,7 +1653,6 @@ out: c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; - c->always_chk_crc = 0; mutex_unlock(&c->umount_mutex); return err; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index ad9cf0133622..de485979ca39 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, * * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc * is true (it is controlled by corresponding mount option). However, if - * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always - * checked. + * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to + * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is + * because during mounting or re-mounting from R/O mode to R/W mode we may read + * journal nodes (when replying the journal or doing the recovery) and the + * journal nodes may potentially be corrupted, so checking is required. */ static int try_read_node(const struct ubifs_info *c, void *buf, int type, int len, int lnum, int offs) @@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, if (node_len != len) return 0; - if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting && + !c->remounting_rw) return 1; crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d1efa37d80ae..d1823541f987 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1169,9 +1169,8 @@ struct ubifs_debug_info; * @empty: %1 if the UBI device is empty * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay + * @mounting: %1 while mounting * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode - * @always_chk_crc: always check CRCs (while mounting and remounting to R/W - * mode) * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1405,8 +1404,8 @@ struct ubifs_info { unsigned int empty:1; unsigned int need_recovery:1; unsigned int replaying:1; + unsigned int mounting:1; unsigned int remounting_rw:1; - unsigned int always_chk_crc:1; struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; From 944fdef52ca9fc0fe077578f51201ef397e30abe Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 16 Jan 2011 19:22:02 +0200 Subject: [PATCH 03/25] UBIFS: do not start the commit if there is nothing to commit This patch fixes suboptimal UBIFS 'sync_fs()' implementation which causes flash I/O even if the file-system is synchronized. E.g., a 'printk()' in the MTD erasure function (e.g., 'nand_erase_nand()') can show that for every 'sync' shell command UBIFS erases at least one eraseblock. So '$ while true; do sync; done' will cause huge amount of flash I/O. The reason for this is that UBIFS commits in 'sync_fs()', and starts the commit even if there is nothing to commit, e.g., it anyway changes the log. This patch adds a check in the 'do_commit()' UBIFS functions which prevents the commit if there is nothing to commit. Reported-by: Hans J. Koch Tested-by: John Ogness Signed-off-by: Artem Bityutskiy --- fs/ubifs/commit.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 02429d81ca33..b148fbc80f8d 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -48,6 +48,56 @@ #include #include "ubifs.h" +/* + * nothing_to_commit - check if there is nothing to commit. + * @c: UBIFS file-system description object + * + * This is a helper function which checks if there is anything to commit. It is + * used as an optimization to avoid starting the commit if it is not really + * necessary. Indeed, the commit operation always assumes flash I/O (e.g., + * writing the commit start node to the log), and it is better to avoid doing + * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is + * nothing to commit, it is more optimal to avoid any flash I/O. + * + * This function has to be called with @c->commit_sem locked for writing - + * this function does not take LPT/TNC locks because the @c->commit_sem + * guarantees that we have exclusive access to the TNC and LPT data structures. + * + * This function returns %1 if there is nothing to commit and %0 otherwise. + */ +static int nothing_to_commit(struct ubifs_info *c) +{ + /* + * During mounting or remounting from R/O mode to R/W mode we may + * commit for various recovery-related reasons. + */ + if (c->mounting || c->remounting_rw) + return 0; + + /* + * If the root TNC node is dirty, we definitely have something to + * commit. + */ + if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags)) + return 0; + + /* + * Even though the TNC is clean, the LPT tree may have dirty nodes. For + * example, this may happen if the budgeting subsystem invoked GC to + * make some free space, and the GC found an LEB with only dirty and + * free space. In this case GC would just change the lprops of this + * LEB (by turning all space into free space) and unmap it. + */ + if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) + return 0; + + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + ubifs_assert(c->dirty_pn_cnt == 0); + ubifs_assert(c->dirty_nn_cnt == 0); + + return 1; +} + /** * do_commit - commit the journal. * @c: UBIFS file-system description object @@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + if (nothing_to_commit(c)) { + up_write(&c->commit_sem); + err = 0; + goto out_cancel; + } + /* Sync all write buffers (necessary for recovery) */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); @@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c) if (err) goto out; +out_cancel: spin_lock(&c->cs_lock); c->cmt_state = COMMIT_RESTING; wake_up(&c->cmt_wq); dbg_cmt("commit end"); spin_unlock(&c->cs_lock); - return 0; out_up: From 822ed64c5b5d15474c6abb1834726643e2cff558 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 6 Feb 2011 14:49:26 +0200 Subject: [PATCH 04/25] UBIFS: remove double semicolon Just a tiny clean-up - remove ;; Signed-off-by: Artem Bityutskiy --- fs/ubifs/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 3e1ee57dbeaa..36216b46f772 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, if (!quiet) ubifs_err("empty space starts at non-aligned offset %d", offs); - goto corrupted;; + goto corrupted; } ubifs_end_scan(c, sleb, lnum, offs); From be7b42a5cb4c5050bcab4f57022007155c119d45 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 6 Feb 2011 16:41:06 +0200 Subject: [PATCH 05/25] UBIFS: describe UBIFS recovery logic some more This patch adds more commentaries about UBIFS recovery logic which should explain the famous UBIFS "corrupt empty space" errors. Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 77e9b874b6c2..6ecbc91ef9ac 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -28,6 +28,23 @@ * UBIFS always cleans away all remnants of an unclean un-mount, so that * errors do not accumulate. However UBIFS defers recovery if it is mounted * read-only, and the flash is not modified in that case. + * + * The general UBIFS approach to the recovery is that it recovers from + * corruptions which could be caused by power cuts, but it refuses to recover + * from corruption caused by other reasons. And UBIFS tries to distinguish + * between these 2 reasons of corruptions and silently recover in the former + * case and loudly complain in the latter case. + * + * UBIFS writes only to erased LEBs, so it writes only to the flash space + * containing only 0xFFs. UBIFS also always writes strictly from the beginning + * of the LEB to the end. And UBIFS assumes that the underlying flash media + * writes in @c->min_io_unit bytes at a time. + * + * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. + * I/O unit corresponding to offset X to contain corrupted data, all the + * following min. I/O units have to contain empty space (all 0xFFs). If this is + * not true, the corruption cannot be the result of a power cut, and UBIFS + * refuses to mount. */ #include @@ -671,6 +688,10 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } else { int corruption = first_non_ff(buf, len); + /* + * See header comment for this file for more + * explanations about the reasons we have this check. + */ ubifs_err("corrupt empty space LEB %d:%d, corruption " "starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ From 8c559d30b4e59cf6994215ada1fe744928f494bf Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Fri, 4 Feb 2011 15:24:19 +0300 Subject: [PATCH 06/25] UBIFS: restrict world-writable debugfs files Don't allow everybody to dump sensitive information about filesystems. Signed-off-by: Vasiliy Kulikov Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0bee4dbffc31..bcb1acb79263 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2813,19 +2813,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_tnc = dent; From 10ac27970274e9094aee84a6452a25bf1b7e59e1 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 8 Feb 2011 17:21:11 +0200 Subject: [PATCH 07/25] UBIFS: fix LEB number in printk This is a minor patch which fixes the LEB number we print when corrupted empty space is found. Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 6ecbc91ef9ac..e2714f8f05ff 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -695,7 +695,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, ubifs_err("corrupt empty space LEB %d:%d, corruption " "starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ - offs = corruption; + offs += corruption; buf += corruption; goto corrupted; } From 30b542ef453e6832ff682170b2db95d7bca2fe70 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 30 Jan 2011 18:37:33 +0200 Subject: [PATCH 08/25] UBI: incorporate maximum write size Incorporate MTD write buffer size into UBI device information because UBIFS needs this field. UBI does not use it ATM, just provides to upper layers in 'struct ubi_device_info'. Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/build.c | 14 ++++++++++++++ drivers/mtd/ubi/kapi.c | 1 + drivers/mtd/ubi/ubi.h | 3 +++ include/linux/mtd/ubi.h | 19 +++++++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 5ebe280225d6..f38e8de81811 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -690,11 +690,25 @@ static int io_init(struct ubi_device *ubi) ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); + ubi->max_write_size = ubi->mtd->writebufsize; + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (ubi->max_write_size < ubi->min_io_size || + ubi->max_write_size % ubi->min_io_size || + !is_power_of_2(ubi->max_write_size)) { + ubi_err("bad write buffer size %d for %d min. I/O unit", + ubi->max_write_size, ubi->min_io_size); + return -EINVAL; + } + /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_msg("min_io_size %d", ubi->min_io_size); + dbg_msg("max_write_size %d", ubi->max_write_size); dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_msg("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_msg("vid_hdr_alsize %d", ubi->vid_hdr_alsize); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 69fa4ef03c53..701df4f848f6 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -41,6 +41,7 @@ void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di) di->ubi_num = ubi->ubi_num; di->leb_size = ubi->leb_size; di->min_io_size = ubi->min_io_size; + di->max_write_size = ubi->max_write_size; di->ro_mode = ubi->ro_mode; di->cdev = ubi->cdev.dev; } diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 0b0149c41fe3..b78994330ebc 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -381,6 +381,8 @@ struct ubi_wl_entry; * @bad_allowed: whether the MTD device admits of bad physical eraseblocks or * not * @nor_flash: non-zero if working on top of NOR flash + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) * @mtd: MTD device descriptor * * @peb_buf1: a buffer of PEB size used for different purposes @@ -464,6 +466,7 @@ struct ubi_device { int vid_hdr_shift; unsigned int bad_allowed:1; unsigned int nor_flash:1; + int max_write_size; struct mtd_info *mtd; void *peb_buf1; diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index b31bd9e9bca3..36c70593ae62 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -117,17 +117,36 @@ struct ubi_volume_info { * @ubi_num: ubi device number * @leb_size: logical eraseblock size on this UBI device * @min_io_size: minimal I/O unit size + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) * @ro_mode: if this device is in read-only mode * @cdev: UBI character device major and minor numbers * * Note, @leb_size is the logical eraseblock size offered by the UBI device. * Volumes of this UBI device may have smaller logical eraseblock size if their * alignment is not equivalent to %1. + * + * The @max_write_size field describes flash write maximum write unit. For + * example, NOR flash allows for changing individual bytes, so @min_io_size is + * %1. However, it does not mean than NOR flash has to write data byte-by-byte. + * Instead, CFI NOR flashes have a write-buffer of, e.g., 64 bytes, and when + * writing large chunks of data, they write 64-bytes at a time. Obviously, this + * improves write throughput. + * + * Also, the MTD device may have N interleaved (striped) flash chips + * underneath, in which case @min_io_size can be physical min. I/O size of + * single flash chip, while @max_write_size can be N * @min_io_size. + * + * The @max_write_size field is always greater or equivalent to @min_io_size. + * E.g., some NOR flashes may have (@min_io_size = 1, @max_write_size = 64). In + * contrast, NAND flashes usually have @min_io_size = @max_write_size = NAND + * page size. */ struct ubi_device_info { int ubi_num; int leb_size; int min_io_size; + int max_write_size; int ro_mode; dev_t cdev; }; From f43ec882b8b65de0ebde2e1ad52e8de0349d83ae Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 14 Feb 2011 13:36:54 +0200 Subject: [PATCH 09/25] UBI: provide LEB offset information Provide the LEB offset information in the UBI device information data structure. This piece of information is required by UBIFS to find out what are the LEB offsets which are aligned to the max. write size. If LEB offset not aligned to max. write size, then UBIFS has to take this into account to write more optimally. Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/kapi.c | 1 + include/linux/mtd/ubi.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 701df4f848f6..d39716e5b204 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -40,6 +40,7 @@ void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di) { di->ubi_num = ubi->ubi_num; di->leb_size = ubi->leb_size; + di->leb_start = ubi->leb_start; di->min_io_size = ubi->min_io_size; di->max_write_size = ubi->max_write_size; di->ro_mode = ubi->ro_mode; diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index 36c70593ae62..84854edf4436 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -116,6 +116,8 @@ struct ubi_volume_info { * struct ubi_device_info - UBI device description data structure. * @ubi_num: ubi device number * @leb_size: logical eraseblock size on this UBI device + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @min_io_size: minimal I/O unit size * @max_write_size: maximum amount of bytes the underlying flash can write at a * time (MTD write buffer size) @@ -145,6 +147,7 @@ struct ubi_volume_info { struct ubi_device_info { int ubi_num; int leb_size; + int leb_start; int min_io_size; int max_write_size; int ro_mode; From 3e8e2e0c8da1f1701a8014543c951c41751791cc Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 30 Jan 2011 18:58:32 +0200 Subject: [PATCH 10/25] UBIFS: incorporate maximum write size Incorporate maximum write size into the UBIFS description data structure. This patch just introduces new 'c->max_write_size' and 'c->max_write_shift' fields as a preparation for the following patches. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 19 +++++++++++++++++++ fs/ubifs/ubifs.h | 5 +++++ 2 files changed, 24 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 703a62109cf2..efc327b92f98 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -507,6 +507,8 @@ static int init_constants_early(struct ubifs_info *c) c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; + c->max_write_size = c->di.max_write_size; + c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_err("too small LEBs (%d bytes), min. is %d bytes", @@ -525,6 +527,18 @@ static int init_constants_early(struct ubifs_info *c) return -EINVAL; } + /* + * Maximum write size has to be greater or equivalent to min. I/O + * size, and be multiple of min. I/O size. + */ + if (c->max_write_size < c->min_io_size || + c->max_write_size % c->min_io_size || + !is_power_of_2(c->max_write_size)) { + ubifs_err("bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); + return -EINVAL; + } + /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is @@ -533,6 +547,10 @@ static int init_constants_early(struct ubifs_info *c) if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; + if (c->max_write_size < c->min_io_size) { + c->max_write_size = c->min_io_size; + c->max_write_shift = c->min_io_shift; + } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); @@ -1391,6 +1409,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("compiled on: " __DATE__ " at " __TIME__); dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); + dbg_msg("max. write size: %d bytes", c->max_write_size); dbg_msg("LEB size: %d bytes (%d KiB)", c->leb_size, c->leb_size >> 10); dbg_msg("data journal heads: %d", diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d1823541f987..8b519499f14a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1024,6 +1024,9 @@ struct ubifs_debug_info; * * @min_io_size: minimal input/output unit size * @min_io_shift: number of bits in @min_io_size minus one + * @max_write_size: maximum amount of bytes the underlying flash can write at a + * time (MTD write buffer size) + * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes * @half_leb_size: half LEB size * @idx_leb_size: how many bytes of an LEB are effectively available when it is @@ -1270,6 +1273,8 @@ struct ubifs_info { int min_io_size; int min_io_shift; + int max_write_size; + int max_write_shift; int leb_size; int half_leb_size; int idx_leb_size; From ca2ec61d157f23ec24aaa200f8016ea0a8aeb617 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 14 Feb 2011 15:17:55 +0200 Subject: [PATCH 11/25] UBI: incorporate LEB offset information Incorporate the LEB offset information into UBIFS. We'll use this information in one of the next patches to figure out what are the max. write size offsets relative to the PEB. So this patch is just a preparation. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 1 + fs/ubifs/ubifs.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index efc327b92f98..d4b4cb4596e2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -504,6 +504,7 @@ static int init_constants_early(struct ubifs_info *c) c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; + c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 8b519499f14a..942c1d3cb0db 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1028,6 +1028,8 @@ struct ubifs_debug_info; * time (MTD write buffer size) * @max_write_shift: number of bits in @max_write_size minus one * @leb_size: logical eraseblock size in bytes + * @leb_start: starting offset of logical eraseblocks within physical + * eraseblocks * @half_leb_size: half LEB size * @idx_leb_size: how many bytes of an LEB are effectively available when it is * used to store indexing nodes (@leb_size - @max_idx_node_sz) @@ -1276,6 +1278,7 @@ struct ubifs_info { int max_write_size; int max_write_shift; int leb_size; + int leb_start; int half_leb_size; int idx_leb_size; int leb_cnt; From 3c89f396dc78671cfbc1eb20ef1d5be6a9a02780 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 1 Feb 2011 19:02:49 +0200 Subject: [PATCH 12/25] UBIFS: introduce write-buffer size field Currently we assume write-buffer size is always min_io_size. But this is about to change and write-buffers may be of variable size. Namely, they will be of max_write_size at the beginning, but will get smaller when we are approaching the end of LEB. This is a preparation patch which introduces 'size' field in the write-buffer structure which carries the current write-buffer size. Signed-off-by: Artem Bityutskiy --- fs/ubifs/io.c | 28 +++++++++++++++++++--------- fs/ubifs/ubifs.h | 2 ++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index d1fe56203a1d..7c2a014b59f9 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -361,7 +361,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dbg_io("LEB %d:%d, %d bytes, jhead %s", wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); ubifs_assert(!(wbuf->avail & 7)); - ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); + ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); if (c->ro_error) @@ -369,10 +372,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); + wbuf->size, wbuf->dtype); if (err) { ubifs_err("cannot write %d bytes to LEB %d:%d", - c->min_io_size, wbuf->lnum, wbuf->offs); + wbuf->size, wbuf->lnum, wbuf->offs); dbg_dump_stack(); return err; } @@ -380,8 +383,9 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) dirt = wbuf->avail; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; + wbuf->offs += wbuf->size; wbuf->avail = c->min_io_size; + wbuf->size = c->min_io_size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -425,6 +429,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, wbuf->lnum = lnum; wbuf->offs = offs; wbuf->avail = c->min_io_size; + wbuf->size = c->min_io_size; wbuf->used = 0; spin_unlock(&wbuf->lock); wbuf->dtype = dtype; @@ -522,7 +527,10 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); - ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); + ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size); + ubifs_assert(wbuf->size >= c->min_io_size); + ubifs_assert(wbuf->size <= c->max_write_size); + ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); @@ -547,7 +555,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_io("flush jhead %s wbuf to LEB %d:%d", dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, - wbuf->offs, c->min_io_size, + wbuf->offs, wbuf->size, wbuf->dtype); if (err) goto out; @@ -555,6 +563,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) spin_lock(&wbuf->lock); wbuf->offs += c->min_io_size; wbuf->avail = c->min_io_size; + wbuf->size = c->min_io_size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -577,11 +586,11 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - c->min_io_size, wbuf->dtype); + wbuf->size, wbuf->dtype); if (err) goto out; - offs = wbuf->offs + c->min_io_size; + offs = wbuf->offs + wbuf->size; len -= wbuf->avail; aligned_len -= wbuf->avail; written = wbuf->avail; @@ -618,6 +627,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) wbuf->offs = offs; wbuf->used = aligned_len; wbuf->avail = c->min_io_size - aligned_len; + wbuf->size = c->min_io_size; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -855,7 +865,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = c->min_io_size; + wbuf->avail = wbuf->size = c->min_io_size; wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 942c1d3cb0db..362495078489 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -646,6 +646,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @offs: write-buffer offset in this logical eraseblock * @avail: number of bytes available in the write-buffer * @used: number of used bytes in the write-buffer + * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range) * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, * %UBI_UNKNOWN) * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep @@ -680,6 +681,7 @@ struct ubifs_wbuf { int offs; int avail; int used; + int size; int dtype; int jhead; int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); From 6c7f74f703cc4baf053270a6e78a32f832f03445 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 6 Feb 2011 14:45:26 +0200 Subject: [PATCH 13/25] UBIFS: use max_write_size for write-buffers Switch write-buffers from 'c->min_io_size' to 'c->max_write_size' which presumably has to be more write speed-efficient. However, when write-buffer is synchronized, write only the the min. I/O units which contain the data, do not write whole write-buffer. This is more space-efficient. Additionally, this patch takes into account that the LEB might not start from the max. write unit-aligned address. Signed-off-by: Artem Bityutskiy --- fs/ubifs/io.c | 181 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 137 insertions(+), 44 deletions(-) diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 7c2a014b59f9..dfd168b7807e 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -31,6 +31,26 @@ * buffer is full or when it is not used for some time (by timer). This is * similar to the mechanism is used by JFFS2. * + * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum + * write size (@c->max_write_size). The latter is the maximum amount of bytes + * the underlying flash is able to program at a time, and writing in + * @c->max_write_size units should presumably be faster. Obviously, + * @c->min_io_size <= @c->max_write_size. Write-buffers are of + * @c->max_write_size bytes in size for maximum performance. However, when a + * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size + * boundary) which contains data is written, not the whole write-buffer, + * because this is more space-efficient. + * + * This optimization adds few complications to the code. Indeed, on the one + * hand, we want to write in optimal @c->max_write_size bytes chunks, which + * also means aligning writes at the @c->max_write_size bytes offsets. On the + * other hand, we do not want to waste space when synchronizing the write + * buffer, so during synchronization we writes in smaller chunks. And this makes + * the next write offset to be not aligned to @c->max_write_size bytes. So the + * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned + * to @c->max_write_size bytes again. We do this by temporarily shrinking + * write-buffer size (@wbuf->size). + * * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by * mutexes defined inside these objects. Since sometimes upper-level code * has to lock the write-buffer (e.g. journal space reservation code), many @@ -46,8 +66,8 @@ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it * uses padding nodes or padding bytes, if the padding node does not fit. * - * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes - * every time they are read from the flash media. + * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when + * they are read from the flash media. */ #include @@ -347,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * * This function synchronizes write-buffer @buf and returns zero in case of * success or a negative error code in case of failure. + * + * Note, although write-buffers are of @c->max_write_size, this function does + * not necessarily writes all @c->max_write_size bytes to the flash. Instead, + * if the write-buffer is only partially filled with data, only the used part + * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized. + * This way we waste less space. */ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) { struct ubifs_info *c = wbuf->c; - int err, dirt; + int err, dirt, sync_len; cancel_wbuf_timer_nolock(wbuf); if (!wbuf->used || wbuf->lnum == -1) @@ -366,26 +392,48 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) ubifs_assert(wbuf->size <= c->max_write_size); ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->ro_error) return -EROFS; - ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); + /* + * Do not write whole write buffer but write only the minimum necessary + * amount of min. I/O units. + */ + sync_len = ALIGN(wbuf->used, c->min_io_size); + dirt = sync_len - wbuf->used; + if (dirt) + ubifs_pad(c, wbuf->buf + wbuf->used, dirt); err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); + sync_len, wbuf->dtype); if (err) { ubifs_err("cannot write %d bytes to LEB %d:%d", - wbuf->size, wbuf->lnum, wbuf->offs); + sync_len, wbuf->lnum, wbuf->offs); dbg_dump_stack(); return err; } - dirt = wbuf->avail; - spin_lock(&wbuf->lock); - wbuf->offs += wbuf->size; - wbuf->avail = c->min_io_size; - wbuf->size = c->min_io_size; + wbuf->offs += sync_len; + /* + * Now @wbuf->offs is not necessarily aligned to @c->max_write_size. + * But our goal is to optimize writes and make sure we write in + * @c->max_write_size chunks and to @c->max_write_size-aligned offset. + * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make + * sure that @wbuf->offs + @wbuf->size is aligned to + * @c->max_write_size. This way we make sure that after next + * write-buffer flush we are again at the optimal offset (aligned to + * @c->max_write_size). + */ + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -428,8 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, spin_lock(&wbuf->lock); wbuf->lnum = lnum; wbuf->offs = offs; - wbuf->avail = c->min_io_size; - wbuf->size = c->min_io_size; + if (c->leb_size - wbuf->offs < c->max_write_size) + wbuf->size = c->leb_size - wbuf->offs; + else if (wbuf->offs & (c->max_write_size - 1)) + wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs; + else + wbuf->size = c->max_write_size; + wbuf->avail = wbuf->size; wbuf->used = 0; spin_unlock(&wbuf->lock); wbuf->dtype = dtype; @@ -509,8 +562,9 @@ out_timers: * * This function writes data to flash via write-buffer @wbuf. This means that * the last piece of the node won't reach the flash media immediately if it - * does not take whole minimal I/O unit. Instead, the node will sit in RAM - * until the write-buffer is synchronized (e.g., by timer). + * does not take whole max. write unit (@c->max_write_size). Instead, the node + * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or + * because more data are appended to the write-buffer). * * This function returns zero in case of success and a negative error code in * case of failure. If the node cannot be written because there is no more @@ -533,6 +587,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); + if (c->leb_size - wbuf->offs >= c->max_write_size) + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -561,9 +617,12 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto out; spin_lock(&wbuf->lock); - wbuf->offs += c->min_io_size; - wbuf->avail = c->min_io_size; - wbuf->size = c->min_io_size; + wbuf->offs += wbuf->size; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size; wbuf->used = 0; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -577,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - /* - * The node is large enough and does not fit entirely within current - * minimal I/O unit. We have to fill and flush write-buffer and switch - * to the next min. I/O unit. - */ - dbg_io("flush jhead %s wbuf to LEB %d:%d", - dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); - memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); - err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, - wbuf->size, wbuf->dtype); - if (err) - goto out; + offs = wbuf->offs; + written = 0; - offs = wbuf->offs + wbuf->size; - len -= wbuf->avail; - aligned_len -= wbuf->avail; - written = wbuf->avail; + if (wbuf->used) { + /* + * The node is large enough and does not fit entirely within + * current available space. We have to fill and flush + * write-buffer and switch to the next max. write unit. + */ + dbg_io("flush jhead %s wbuf to LEB %d:%d", + dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); + memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail); + err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->avail; + aligned_len -= wbuf->avail; + written += wbuf->avail; + } else if (wbuf->offs & (c->max_write_size - 1)) { + /* + * The write-buffer offset is not aligned to + * @c->max_write_size and @wbuf->size is less than + * @c->max_write_size. Write @wbuf->size bytes to make sure the + * following writes are done in optimal @c->max_write_size + * chunks. + */ + dbg_io("write %d bytes to LEB %d:%d", + wbuf->size, wbuf->lnum, wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs, + wbuf->size, wbuf->dtype); + if (err) + goto out; + + offs += wbuf->size; + len -= wbuf->size; + aligned_len -= wbuf->size; + written += wbuf->size; + } /* - * The remaining data may take more whole min. I/O units, so write the - * remains multiple to min. I/O unit size directly to the flash media. + * The remaining data may take more whole max. write units, so write the + * remains multiple to max. write unit size directly to the flash media. * We align node length to 8-byte boundary because we anyway flash wbuf * if the remaining space is less than 8 bytes. */ - n = aligned_len >> c->min_io_shift; + n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->min_io_shift; + n <<= c->max_write_shift; dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, wbuf->dtype); @@ -619,15 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (aligned_len) /* * And now we have what's left and what does not take whole - * min. I/O unit, so write it to the write-buffer and we are + * max. write unit, so write it to the write-buffer and we are * done. */ memcpy(wbuf->buf, buf + written, len); wbuf->offs = offs; + if (c->leb_size - wbuf->offs >= c->max_write_size) + wbuf->size = c->max_write_size; + else + wbuf->size = c->leb_size - wbuf->offs; + wbuf->avail = wbuf->size - aligned_len; wbuf->used = aligned_len; - wbuf->avail = c->min_io_size - aligned_len; - wbuf->size = c->min_io_size; wbuf->next_ino = 0; spin_unlock(&wbuf->lock); @@ -851,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; - wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL); if (!wbuf->buf) return -ENOMEM; - size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); wbuf->inodes = kmalloc(size, GFP_KERNEL); if (!wbuf->inodes) { kfree(wbuf->buf); @@ -865,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->used = 0; wbuf->lnum = wbuf->offs = -1; - wbuf->avail = wbuf->size = c->min_io_size; + /* + * If the LEB starts at the max. write size aligned address, then + * write-buffer size has to be set to @c->max_write_size. Otherwise, + * set it to something smaller so that it ends at the closest max. + * write size boundary. + */ + size = c->max_write_size - (c->leb_start % c->max_write_size); + wbuf->avail = wbuf->size = size; wbuf->dtype = UBI_UNKNOWN; wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); From 2765df7da540687c4d57ca840182122f074c5b9c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 2 Feb 2011 09:22:54 +0200 Subject: [PATCH 14/25] UBIFS: use max_write_size during recovery When recovering from unclean reboots UBIFS scans the journal and checks nodes. If a corrupted node is found, UBIFS tries to check if this is the last node in the LEB or not. This is is done by checking if there only 0xFF bytes starting from the next min. I/O unit. However, since now we write in c->max_write_size, we should actually check for 0xFFs starting from the next max. write unit. Signed-off-by: Artem Bityutskiy --- fs/ubifs/recovery.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index e2714f8f05ff..936f2cbfe6b6 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -38,7 +38,7 @@ * UBIFS writes only to erased LEBs, so it writes only to the flash space * containing only 0xFFs. UBIFS also always writes strictly from the beginning * of the LEB to the end. And UBIFS assumes that the underlying flash media - * writes in @c->min_io_unit bytes at a time. + * writes in @c->max_write_size bytes at a time. * * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min. * I/O unit corresponding to offset X to contain corrupted data, all the @@ -379,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * @offs: offset to check * * This function returns %1 if @offs was in the last write to the LEB whose data - * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next @c->min_io_size boundary. + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next @c->max_write_size + * boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { @@ -388,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) uint8_t *p; /* - * Round up to the next @c->min_io_size boundary i.e. @offs is in the - * last wbuf written. After that should be empty space. + * Round up to the next @c->max_write_size boundary i.e. @offs is in + * the last wbuf written. After that should be empty space. */ - empty_offs = ALIGN(offs + 1, c->min_io_size); + empty_offs = ALIGN(offs + 1, c->max_write_size); check_len = c->leb_size - empty_offs; p = buf + empty_offs - offs; return is_empty(p, check_len); @@ -446,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int skip, dlen = le32_to_cpu(ch->len); /* Check for empty space after the corrupt node's common header */ - skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs; if (is_empty(buf + skip, len - skip)) return 1; /* @@ -458,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, return 0; } /* Now we know the corrupt node's length we can skip over it */ - skip = ALIGN(offs + dlen, c->min_io_size) - offs; + skip = ALIGN(offs + dlen, c->max_write_size) - offs; /* After which there should be empty space */ if (is_empty(buf + skip, len - skip)) return 1; @@ -857,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, static int recover_head(const struct ubifs_info *c, int lnum, int offs, void *sbuf) { - int len, err; + int len = c->max_write_size, err; - if (c->min_io_size > 1) - len = c->min_io_size; - else - len = 512; if (offs + len > c->leb_size) len = c->leb_size - offs; From d882962f6af2b484b62a7fb05ef959e1bf355fc4 Mon Sep 17 00:00:00 2001 From: "Matthew L. Creech" Date: Fri, 4 Mar 2011 17:55:02 -0500 Subject: [PATCH 15/25] UBIFS: handle allocation failures in UBIFS write path Running kernel 2.6.37, my PPC-based device occasionally gets an order-2 allocation failure in UBIFS, which causes the root FS to become unwritable: kswapd0: page allocation failure. order:2, mode:0x4050 Call Trace: [c787dc30] [c00085b8] show_stack+0x7c/0x194 (unreliable) [c787dc70] [c0061aec] __alloc_pages_nodemask+0x4f0/0x57c [c787dd00] [c0061b98] __get_free_pages+0x20/0x50 [c787dd10] [c00e4f88] ubifs_jnl_write_data+0x54/0x200 [c787dd50] [c00e82d4] do_writepage+0x94/0x198 [c787dd90] [c00675e4] shrink_page_list+0x40c/0x77c [c787de40] [c0067de0] shrink_inactive_list+0x1e0/0x370 [c787de90] [c0068224] shrink_zone+0x2b4/0x2b8 [c787df00] [c0068854] kswapd+0x408/0x5d4 [c787dfb0] [c0037bcc] kthread+0x80/0x84 [c787dff0] [c000ef44] kernel_thread+0x4c/0x68 Similar problems were encountered last April by Tomasz Stanislawski: http://patchwork.ozlabs.org/patch/50965/ This patch implements Artem's suggested fix: fall back to a mutex-protected static buffer, allocated at mount time. I tested it by forcing execution down the failure path, and didn't see any ill effects. Artem: massaged the patch a little, improved it so that we'd not allocate the write reserve buffer when we are in R/O mode. Signed-off-by: Matthew L. Creech Signed-off-by: Artem Bityutskiy --- fs/ubifs/journal.c | 28 ++++++++++++++++++++++------ fs/ubifs/super.c | 18 ++++++++++++++++++ fs/ubifs/ubifs.h | 14 ++++++++++++++ 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 914f1bd89e57..aed25e864227 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len; - int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_jnl("ino %lu, blk %u, len %d, key %s", @@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, DBGKEY(key)); ubifs_assert(len <= UBIFS_BLOCK_SIZE); - data = kmalloc(dlen, GFP_NOFS); - if (!data) - return -ENOMEM; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); + if (!data) { + /* + * Fall-back to the write reserve buffer. Note, we might be + * currently on the memory reclaim path, when the kernel is + * trying to free some memory by writing out dirty pages. The + * write reserve buffer helps us to guarantee that we are + * always able to write the data. + */ + allocated = 0; + mutex_lock(&c->write_reserve_mutex); + data = c->write_reserve_buf; + } data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); @@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, goto out_ro; finish_reservation(c); - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return 0; out_release: @@ -745,7 +758,10 @@ out_ro: ubifs_ro_mode(c, err); finish_reservation(c); out_free: - kfree(data); + if (!allocated) + mutex_unlock(&c->write_reserve_mutex); + else + kfree(data); return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index d4b4cb4596e2..e360c7a71f9e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1213,6 +1213,13 @@ static int mount_ubifs(struct ubifs_info *c) if (c->bulk_read == 1) bu_init(c); + if (!c->ro_mount) { + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + GFP_KERNEL); + if (!c->write_reserve_buf) + goto out_free; + } + c->mounting = 1; err = ubifs_read_superblock(c); @@ -1482,6 +1489,7 @@ out_wbufs: out_cbuf: kfree(c->cbuf); out_free: + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1520,6 +1528,7 @@ static void ubifs_umount(struct ubifs_info *c) kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); + kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); @@ -1605,6 +1614,10 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + if (!c->write_reserve_buf) + goto out; + err = ubifs_lpt_init(c, 0, 1); if (err) goto out; @@ -1669,6 +1682,8 @@ out: c->bgt = NULL; } free_wbufs(c); + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); @@ -1712,6 +1727,8 @@ static void ubifs_remount_ro(struct ubifs_info *c) free_wbufs(c); vfree(c->orph_buf); c->orph_buf = NULL; + kfree(c->write_reserve_buf); + c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); @@ -1942,6 +1959,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&c->mst_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); c->buds = RB_ROOT; c->old_idx = RB_ROOT; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 362495078489..8c40ad3c6721 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -151,6 +151,12 @@ */ #define WORST_COMPR_FACTOR 2 +/* + * How much memory is needed for a buffer where we comress a data node. + */ +#define COMPRESSED_DATA_NODE_BUF_SZ \ + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 @@ -1005,6 +1011,11 @@ struct ubifs_debug_info; * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu * @bu: pre-allocated bulk-read information * + * @write_reserve_mutex: protects @write_reserve_buf + * @write_reserve_buf: on the write path we allocate memory, which might + * sometimes be unavailable, in which case we use this + * write reserve buffer + * * @log_lebs: number of logical eraseblocks in the log * @log_bytes: log size in bytes * @log_last: last LEB of the log @@ -1256,6 +1267,9 @@ struct ubifs_info { struct mutex bu_mutex; struct bu_info bu; + struct mutex write_reserve_mutex; + void *write_reserve_buf; + int log_lebs; long long log_bytes; int log_last; From 6342aaebda9b94e3cd101ba13eee690ac6577124 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 8 Mar 2011 14:26:47 +0200 Subject: [PATCH 16/25] UBIFS: print max. index node size Improve debugging messages by printing the maximum index node size on mount. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index e360c7a71f9e..e9585ad90f5e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1447,9 +1447,9 @@ static int mount_ubifs(struct ubifs_info *c) UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); - dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", + dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, - UBIFS_MAX_DENT_NODE_SZ); + UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); dbg_msg("LEB overhead: %d", c->leb_overhead); From cce3f612fedcbeee61977497b99bbf68a4082b6b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 9 Mar 2011 13:36:23 +0200 Subject: [PATCH 17/25] UBIFS: simplify UBIFS Kconfig menu Remove debug message level and debug checks Kconfig options as they proved to be useless anyway. We have sysfs interface which we can use for fine-grained debugging messages and checks selection, see Documentation/filesystems/ubifs.txt for mode details. Signed-off-by: Artem Bityutskiy --- fs/ubifs/Kconfig | 23 ++++++++++------------- fs/ubifs/debug.c | 4 ++-- fs/ubifs/debug.h | 22 ---------------------- 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 830e3f76f442..1d1859dc3de5 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -44,23 +44,20 @@ config UBIFS_FS_ZLIB # Debugging-related stuff config UBIFS_FS_DEBUG - bool "Enable debugging" + bool "Enable debugging support" depends on UBIFS_FS select DEBUG_FS select KALLSYMS_ALL help - This option enables UBIFS debugging. - -config UBIFS_FS_DEBUG_MSG_LVL - int "Default message level (0 = no extra messages, 3 = lots)" - depends on UBIFS_FS_DEBUG - default "0" - help - This controls the amount of debugging messages produced by UBIFS. - If reporting bugs, please try to have available a full dump of the - messages at level 1 while the misbehaviour was occurring. Level 2 - may become necessary if level 1 messages were not enough to find the - bug. Generally Level 3 should be avoided. + This option enables UBIFS debugging support. It makes sure various + assertions, self-checks, debugging messages and test modes are compiled + in (this all is compiled out otherwise). Assertions are light-weight + and this option also enables them. Self-checks, debugging messages and + test modes are switched off by default. Thus, it is safe and actually + recommended to have debugging support enabled, and it should not slow + down UBIFS. You can then further enable / disable individual debugging + features using UBIFS module parameters and the corresponding sysfs + interfaces. config UBIFS_FS_DEBUG_CHKS bool "Enable extra checks" diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index bcb1acb79263..02c10dccdd60 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; -unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_msg_flags; +unsigned int ubifs_chk_flags; unsigned int ubifs_tst_flags; module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 69ebe4729151..10190c189817 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -205,12 +205,6 @@ enum { UBIFS_MSG_RCVRY = 0x1000, }; -/* Debugging message type flags for each default debug message level */ -#define UBIFS_MSG_LVL_0 0 -#define UBIFS_MSG_LVL_1 0x1 -#define UBIFS_MSG_LVL_2 0x7f -#define UBIFS_MSG_LVL_3 0xffff - /* * Debugging check flags (must match chk_names in debug.c). * @@ -243,22 +237,6 @@ enum { UBIFS_TST_RCVRY = 0x4, }; -#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 -#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 -#else -#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 -#endif - -#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS -#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff -#else -#define UBIFS_CHK_FLAGS_DEFAULT 0 -#endif - extern spinlock_t dbg_lock; extern unsigned int ubifs_msg_flags; From 2bcf002159c2aedd5c0ab5a21c3ea73fec87ff8d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 10 Mar 2011 16:26:32 +0200 Subject: [PATCH 18/25] UBIFS: do not check data crc by default Change the default UBIFS behavior WRT data CRC checking. Currently, UBIFS checks data CRC when reading, which slows it down quite a bit, and this is the default option. However, it looks like in average user does not need this feature and would prefer faster read speed over extra reliability. And this seems to be de-facto standard that file-systems do not check data CRC every time they read from the media. Thus, make UBIFS default behavior so that it does not check data CRC. This corresponds to the no_chk_data_crc mount option. Those users who need extra protection can always enable it using the chk_data_crc option. Please, read more information about this feature here: http://www.linux-mtd.infradead.org/doc/ubifs.html#L_checksumming Signed-off-by: Artem Bityutskiy --- Documentation/filesystems/ubifs.txt | 4 ++-- fs/ubifs/super.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index 12fedb7834c6..d7b13b01e980 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -82,12 +82,12 @@ Mount options bulk_read read more in one go to take advantage of flash media that read faster sequentially no_bulk_read (*) do not bulk-read -no_chk_data_crc skip checking of CRCs on data nodes in order to +no_chk_data_crc (*) skip checking of CRCs on data nodes in order to improve read performance. Use this option only if the flash media is highly reliable. The effect of this option is that corruption of the contents of a file can go unnoticed. -chk_data_crc (*) do not skip checking CRCs on data nodes +chk_data_crc do not skip checking CRCs on data nodes compr=none override default compressor and set it to "none" compr=lzo override default compressor and set it to "lzo" compr=zlib override default compressor and set it to "zlib" diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index e9585ad90f5e..1da5155a1bea 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1977,6 +1977,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->old_buds); INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; c->vfs_sb = sb; c->highest_inum = UBIFS_FIRST_INO; From 73d9aec3fd212d7bf8af5aa1eca9c79f8a90ad5a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 15:39:09 +0200 Subject: [PATCH 19/25] UBIFS: allocate dump buffer on demand Instead of using pre-allocated 'c->dbg->buf' buffer in 'dbg_dump_leb()', dynamically allocate it when needed. The intend is to get rid of the pre-allocated 'c->dbg->buf' buffer and save 128KiB of RAM (or more if PEB size is larger). Indeed, currently we allocate this memory even if the user never enables any self-check, which is wasteful. Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 02c10dccdd60..c2e5c08a9d30 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; + void *buf; if (dbg_failure_mode) return; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + return; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { ubifs_err("scan error %d", (int)PTR_ERR(sleb)); - return; + goto out; } printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, @@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); ubifs_scan_destroy(sleb); + +out: + vfree(buf); return; } From cd5f7485bbbbfeea4363b535abeaa01df6942c66 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 15:50:37 +0200 Subject: [PATCH 20/25] UBIFS: allocate scanning buffer on demand Instead of using pre-allocated 'c->dbg->buf' buffer in 'scan_check_cb()', dynamically allocate it when needed. The intend is to get rid of the pre-allocated 'c->dbg->buf' buffer and save 128KiB of RAM (or more if PEB size is larger). Indeed, currently we allocate this memory even if the user never enables any self-check, which is wasteful. Signed-off-by: Artem Bityutskiy --- fs/ubifs/lprops.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 4d4ca388889b..c7b25e2f7764 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c, struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; struct ubifs_lp_stats *lst = &data->lst; - int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; + void *buf = NULL; cat = lp->flags & LPROPS_CAT_MASK; if (cat != LPROPS_UNCAT) { @@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c, } } - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to scan LEB %d", lnum); + goto out; + } + + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { /* * After an unclean unmount, empty and freeable LEBs @@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c, lst->empty_lebs += 1; lst->total_free += c->leb_size; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } if (lp->free + lp->dirty == c->leb_size && @@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c, lst->total_free += lp->free; lst->total_dirty += lp->dirty; lst->total_dark += ubifs_calc_dark(c, c->leb_size); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; + goto exit; } data->err = PTR_ERR(sleb); - return LPT_SCAN_STOP; + ret = LPT_SCAN_STOP; + goto exit; } is_idx = -1; @@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - return LPT_SCAN_CONTINUE; + ret = LPT_SCAN_CONTINUE; +exit: + vfree(buf); + return ret; out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " @@ -1246,6 +1259,7 @@ out_print: out_destroy: ubifs_scan_destroy(sleb); out: + vfree(buf); data->err = -EINVAL; return LPT_SCAN_STOP; } From 6fb324a4b0c3c9297cd569bd125ed691f2f98d57 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 15:56:38 +0200 Subject: [PATCH 21/25] UBIFS: allocate ltab checking buffer on demand Instead of using pre-allocated 'c->dbg->buf' buffer in 'dbg_check_ltab_lnum()', dynamically allocate it when needed. The intend is to get rid of the pre-allocated 'c->dbg->buf' buffer and save 128KiB of RAM (or more if PEB size is larger). Indeed, currently we allocate this memory even if the user never enables any self-check, which is wasteful. Signed-off-by: Artem Bityutskiy --- fs/ubifs/lpt_commit.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 5c90dec5db0b..62a38d9c55e9 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) { int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; int ret; - void *buf = c->dbg->buf; + void *buf, *p; if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) return 0; + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory for ltab checking"); + return 0; + } + dbg_lp("LEB %d", lnum); err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); - return err; + goto out; } while (1) { - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int i, pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { - buf += pad_len; + p += pad_len; len -= pad_len; dirty += pad_len; continue; } - if (!dbg_is_all_ff(buf, len)) { + if (!dbg_is_all_ff(p, len)) { dbg_msg("invalid empty space in LEB %d at %d", lnum, c->leb_size - len); err = -EINVAL; @@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } - return err; + goto out; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); node_len = get_lpt_node_len(c, node_type); ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); if (ret == 1) dirty += node_len; - buf += node_len; + p += node_len; len -= node_len; } + + err = 0; +out: + vfree(buf); + return err; } /** From cab95d446cb766062fa7e2e7e326035d7c65b803 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 16:58:39 +0200 Subject: [PATCH 22/25] UBIFS: allocate lpt dump buffer on demand Instead of using pre-allocated 'c->dbg->buf' buffer in 'dump_lpt_leb()', dynamically allocate it when needed. The intend is to get rid of the pre-allocated 'c->dbg->buf' buffer and save 128KiB of RAM (or more if PEB size is larger). Indeed, currently we allocate this memory even if the user never enables any self-check, which is wasteful. Signed-off-by: Artem Bityutskiy --- fs/ubifs/lpt_commit.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 62a38d9c55e9..0a3c2c3f5c4a 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1881,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) static void dump_lpt_leb(const struct ubifs_info *c, int lnum) { int err, len = c->leb_size, node_type, node_num, node_len, offs; - void *buf = c->dbg->buf; + void *buf, *p; printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", current->pid, lnum); + buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to dump LPT"); + return; + } + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); if (err) { ubifs_err("cannot read LEB %d, error %d", lnum, err); - return; + goto out; } while (1) { offs = c->leb_size - len; - if (!is_a_node(c, buf, len)) { + if (!is_a_node(c, p, len)) { int pad_len; - pad_len = get_pad_len(c, buf, len); + pad_len = get_pad_len(c, p, len); if (pad_len) { printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", lnum, offs, pad_len); - buf += pad_len; + p += pad_len; len -= pad_len; continue; } @@ -1909,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; } - node_type = get_lpt_node_type(c, buf, &node_num); + node_type = get_lpt_node_type(c, p, &node_num); switch (node_type) { case UBIFS_LPT_PNODE: { @@ -1934,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) else printk(KERN_DEBUG "LEB %d:%d, nnode, ", lnum, offs); - err = ubifs_unpack_nnode(c, buf, &nnode); + err = ubifs_unpack_nnode(c, p, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); @@ -1955,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) break; default: ubifs_err("LPT node type %d not recognized", node_type); - return; + goto out; } - buf += node_len; + p += node_len; len -= node_len; } printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", current->pid, lnum); +out: + vfree(buf); + return; } /** From f5cf319cf32d2284b3fbc24f3c526e2a9363b4ac Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 17:11:25 +0200 Subject: [PATCH 23/25] UBIFS: allocate orphans scan buffer on demand Instead of using pre-allocated 'c->dbg->buf' buffer in 'dbg_scan_orphans()', dynamically allocate it when needed. The intend is to get rid of the pre-allocated 'c->dbg->buf' buffer and save 128KiB of RAM (or more if PEB size is larger). Indeed, currently we allocate this memory even if the user never enables any self-check, which is wasteful. Signed-off-by: Artem Bityutskiy --- fs/ubifs/orphan.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 82009c74b6a3..2cdbd31641d7 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) { int lnum, err = 0; + void *buf; /* Check no-orphans flag and skip this if no orphans */ if (c->no_orphs) return 0; + buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL); + if (!buf) { + ubifs_err("cannot allocate memory to check orphans"); + return 0; + } + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { struct ubifs_scan_leb *sleb; - sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); + sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; @@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) break; } + vfree(buf); return err; } From 7c83cc91ab1505e53ebfb99b1ea19ed1cf01c1b0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 11 Mar 2011 17:15:55 +0200 Subject: [PATCH 24/25] UBIFS: save 128KiB or more RAM When debugging is enabled, we allocate a buffer of PEB size for various debugging purposes. However, now all users of this buffer are gone and we can safely remove it and save 128KiB or more RAM. Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 9 --------- fs/ubifs/debug.h | 2 -- 2 files changed, 11 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index c2e5c08a9d30..01c2b028e525 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2701,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c) if (!c->dbg) return -ENOMEM; - c->dbg->buf = vmalloc(c->leb_size); - if (!c->dbg->buf) - goto out; - failure_mode_init(c); return 0; - -out: - kfree(c->dbg); - return -ENOMEM; } /** @@ -2720,7 +2712,6 @@ out: void ubifs_debugging_exit(struct ubifs_info *c) { failure_mode_exit(c); - vfree(c->dbg->buf); kfree(c->dbg); } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 10190c189817..4efbba78669a 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,6 @@ /** * ubifs_debug_info - per-FS debugging information. - * @buf: a buffer of LEB size, used for various purposes * @old_zroot: old index root - used by 'dbg_check_old_index()' * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' @@ -54,7 +53,6 @@ * dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { - void *buf; struct ubifs_zbranch old_zroot; int old_zroot_level; unsigned long long old_zroot_sqnum; From 5d630e43284fdb0613e4e7e7dd906f27bc25b6af Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 14 Mar 2011 17:55:40 +0200 Subject: [PATCH 25/25] UBIFS: clean-up commentaries Clean-up commentaries in debug.h and remove references to non-existing symblols. Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 4efbba78669a..919f0de29d8f 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -171,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) /* - * Debugging message type flags (must match msg_type_names in debug.c). + * Debugging message type flags. * * UBIFS_MSG_GEN: general messages * UBIFS_MSG_JNL: journal messages @@ -204,7 +204,7 @@ enum { }; /* - * Debugging check flags (must match chk_names in debug.c). + * Debugging check flags. * * UBIFS_CHK_GEN: general checks * UBIFS_CHK_TNC: check TNC @@ -225,7 +225,7 @@ enum { }; /* - * Special testing flags (must match tst_names in debug.c). + * Special testing flags. * * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method * UBIFS_TST_RCVRY: failure mode for recovery testing