3584240b9c
Rename all uds_log_* to vdo_log_*.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Chung Chung <cchung@redhat.com>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2023 Red Hat
*/

#include "repair.h"

#include <linux/min_heap.h>
#include <linux/minmax.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "encodings.h"
#include "int-map.h"
#include "io-submitter.h"
#include "recovery-journal.h"
#include "slab-depot.h"
#include "types.h"
#include "vdo.h"
#include "wait-queue.h"

/*
* An explicitly numbered block mapping. Numbering the mappings allows them to be sorted by logical
* block number during repair while still preserving the relative order of journal entries with
* the same logical block number.
*/
struct numbered_block_mapping {
struct block_map_slot block_map_slot;
struct block_map_entry block_map_entry;
/* A serial number to use during replay */
u32 number;
} __packed;

/*
* The absolute position of an entry in the recovery journal, including the sector number and the
* entry number within the sector.
*/
struct recovery_point {
/* Block sequence number */
sequence_number_t sequence_number;
/* Sector number */
u8 sector_count;
/* Entry number */
journal_entry_count_t entry_count;
/* Whether or not the increment portion of the current entry has been applied */
bool increment_applied;
};

struct repair_completion {
/* The completion header */
struct vdo_completion completion;

/* A buffer to hold the data read off disk */
char *journal_data;

/* For loading the journal */
data_vio_count_t vio_count;
data_vio_count_t vios_complete;
struct vio *vios;

/* The number of entries to be applied to the block map */
size_t block_map_entry_count;
/* The sequence number of the first valid block for block map recovery */
sequence_number_t block_map_head;
/* The sequence number of the first valid block for slab journal replay */
sequence_number_t slab_journal_head;
/* The sequence number of the last valid block of the journal (if known) */
sequence_number_t tail;
/*
* The highest sequence number of the journal. During recovery (vs read-only rebuild), not
* the same as the tail, since the tail ignores blocks after the first hole.
*/
sequence_number_t highest_tail;

/* The number of logical blocks currently known to be in use */
block_count_t logical_blocks_used;
/* The number of block map data blocks known to be allocated */
block_count_t block_map_data_blocks;

/* These fields are for playing the journal into the block map */
/* The entry data for the block map recovery */
struct numbered_block_mapping *entries;
/* The number of entries in the entry array */
size_t entry_count;
/* The number of pending (non-ready) requests */
page_count_t outstanding;
/* The number of page completions */
page_count_t page_count;
bool launching;
/*
* A heap wrapping journal_entries. It re-orders and sorts journal entries in ascending LBN
* order, then original journal order. This permits efficient iteration over the journal
* entries in order.
*/
struct min_heap replay_heap;
/* Fields tracking progress through the journal entries. */
struct numbered_block_mapping *current_entry;
struct numbered_block_mapping *current_unfetched_entry;
/* Current requested page's PBN */
physical_block_number_t pbn;

/* These fields are only used during recovery. */
/* A location just beyond the last valid entry of the journal */
struct recovery_point tail_recovery_point;
/* The location of the next recovery journal entry to apply */
struct recovery_point next_recovery_point;
/* The journal point to give to the next synthesized decref */
struct journal_point next_journal_point;
/* The number of entries played into slab journals */
size_t entries_added_to_slab_journals;

/* These fields are only used during read-only rebuild */
page_count_t page_to_fetch;
/* The number of leaf pages in the block map */
page_count_t leaf_pages;
/* The last slot of the block map */
struct block_map_slot last_slot;

/*
* The page completions used for playing the journal into the block map, and, during
* read-only rebuild, for rebuilding the reference counts from the block map.
*/
struct vdo_page_completion page_completions[];
};
|
|
|
|
/*
|
|
* This is a min_heap callback function that orders numbered_block_mappings using the
|
|
* 'block_map_slot' field as the primary key and the mapping 'number' field as the secondary key.
|
|
* Using the mapping number preserves the journal order of entries for the same slot, allowing us
|
|
* to sort by slot while still ensuring we replay all entries with the same slot in the exact order
|
|
* as they appeared in the journal.
|
|
*/
|
|
static bool mapping_is_less_than(const void *item1, const void *item2)
|
|
{
|
|
const struct numbered_block_mapping *mapping1 =
|
|
(const struct numbered_block_mapping *) item1;
|
|
const struct numbered_block_mapping *mapping2 =
|
|
(const struct numbered_block_mapping *) item2;
|
|
|
|
if (mapping1->block_map_slot.pbn != mapping2->block_map_slot.pbn)
|
|
return mapping1->block_map_slot.pbn < mapping2->block_map_slot.pbn;
|
|
|
|
if (mapping1->block_map_slot.slot != mapping2->block_map_slot.slot)
|
|
return mapping1->block_map_slot.slot < mapping2->block_map_slot.slot;
|
|
|
|
if (mapping1->number != mapping2->number)
|
|
return mapping1->number < mapping2->number;
|
|
|
|
return false;
|
|
}
|
|
|
|
static void swap_mappings(void *item1, void *item2)
|
|
{
|
|
struct numbered_block_mapping *mapping1 = item1;
|
|
struct numbered_block_mapping *mapping2 = item2;
|
|
|
|
swap(*mapping1, *mapping2);
|
|
}
|
|
|
|
static const struct min_heap_callbacks repair_min_heap = {
|
|
.elem_size = sizeof(struct numbered_block_mapping),
|
|
.less = mapping_is_less_than,
|
|
.swp = swap_mappings,
|
|
};
|
|
|
|
static struct numbered_block_mapping *sort_next_heap_element(struct repair_completion *repair)
|
|
{
|
|
struct min_heap *heap = &repair->replay_heap;
|
|
struct numbered_block_mapping *last;
|
|
|
|
if (heap->nr == 0)
|
|
return NULL;
|
|
|
|
/*
|
|
* Swap the next heap element with the last one on the heap, popping it off the heap,
|
|
* restore the heap invariant, and return a pointer to the popped element.
|
|
*/
|
|
last = &repair->entries[--heap->nr];
|
|
swap_mappings(heap->data, last);
|
|
min_heapify(heap, 0, &repair_min_heap);
|
|
return last;
|
|
}
|
|
|
|
/**
|
|
* as_repair_completion() - Convert a generic completion to a repair_completion.
|
|
* @completion: The completion to convert.
|
|
*
|
|
* Return: The repair_completion.
|
|
*/
|
|
static inline struct repair_completion * __must_check
|
|
as_repair_completion(struct vdo_completion *completion)
|
|
{
|
|
vdo_assert_completion_type(completion, VDO_REPAIR_COMPLETION);
|
|
return container_of(completion, struct repair_completion, completion);
|
|
}
|
|
|
|
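/**
* prepare_repair_completion() - Reset the repair completion and register its next callback.
* @repair: The repair completion.
* @callback: The callback to register.
* @zone_type: The zone type, which determines the thread on which the callback will run.
*/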
static void prepare_repair_completion(struct repair_completion *repair,
|
|
vdo_action_fn callback, enum vdo_zone_type zone_type)
|
|
{
|
|
struct vdo_completion *completion = &repair->completion;
|
|
const struct thread_config *thread_config = &completion->vdo->thread_config;
|
|
thread_id_t thread_id;
|
|
|
|
/* All block map access is done on a single thread, so use logical zone 0. */
|
|
thread_id = ((zone_type == VDO_ZONE_TYPE_LOGICAL) ?
|
|
thread_config->logical_threads[0] :
|
|
thread_config->admin_thread);
|
|
vdo_reset_completion(completion);
|
|
vdo_set_completion_callback(completion, callback, thread_id);
|
|
}
|
|
|
|
static void launch_repair_completion(struct repair_completion *repair,
|
|
vdo_action_fn callback, enum vdo_zone_type zone_type)
|
|
{
|
|
prepare_repair_completion(repair, callback, zone_type);
|
|
vdo_launch_completion(&repair->completion);
|
|
}
|
|
|
|
static void uninitialize_vios(struct repair_completion *repair)
|
|
{
|
|
while (repair->vio_count > 0)
|
|
free_vio_components(&repair->vios[--repair->vio_count]);
|
|
|
|
vdo_free(vdo_forget(repair->vios));
|
|
}
|
|
|
|
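/**
* free_repair_completion() - Free a repair completion and all the resources it holds.
* @repair: The repair completion to free (may be NULL).
*/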
static void free_repair_completion(struct repair_completion *repair)
|
|
{
|
|
if (repair == NULL)
|
|
return;
|
|
|
|
/*
|
|
* We do this here because this function is the only common bottleneck for all clean up
|
|
* paths.
|
|
*/
|
|
repair->completion.vdo->block_map->zones[0].page_cache.rebuilding = false;
|
|
|
|
uninitialize_vios(repair);
|
|
vdo_free(vdo_forget(repair->journal_data));
|
|
vdo_free(vdo_forget(repair->entries));
|
|
vdo_free(repair);
|
|
}
|
|
|
|
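/**
* finish_repair() - Finish repairing the vdo and notify the parent completion.
* @completion: The repair completion.
*
* This callback is registered in vdo_repair() and drain_slab_depot().
*/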
static void finish_repair(struct vdo_completion *completion)
|
|
{
|
|
struct vdo_completion *parent = completion->parent;
|
|
struct vdo *vdo = completion->vdo;
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
|
|
vdo_assert_on_admin_thread(vdo, __func__);
|
|
|
|
if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE)
|
|
vdo->states.vdo.complete_recoveries++;
|
|
|
|
vdo_initialize_recovery_journal_post_repair(vdo->recovery_journal,
|
|
vdo->states.vdo.complete_recoveries,
|
|
repair->highest_tail,
|
|
repair->logical_blocks_used,
|
|
repair->block_map_data_blocks);
|
|
free_repair_completion(vdo_forget(repair));
|
|
|
|
if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
|
|
vdo_log_info("Read-only rebuild complete");
|
|
vdo_launch_completion(parent);
|
|
return;
|
|
}
|
|
|
|
/* FIXME: shouldn't this say either "recovery" or "repair"? */
|
|
vdo_log_info("Rebuild complete");
|
|
|
|
/*
|
|
* Now that we've freed the repair completion and its vast array of journal entries, we
|
|
* can allocate refcounts.
|
|
*/
|
|
vdo_continue_completion(parent, vdo_allocate_reference_counters(vdo->depot));
|
|
}
|
|
|
|
/**
|
|
* abort_repair() - Handle a repair error.
|
|
* @completion: The repair completion.
|
|
*/
|
|
static void abort_repair(struct vdo_completion *completion)
|
|
{
|
|
struct vdo_completion *parent = completion->parent;
|
|
int result = completion->result;
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
|
|
if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state))
|
|
vdo_log_info("Read-only rebuild aborted");
|
|
else
|
|
vdo_log_warning("Recovery aborted");
|
|
|
|
free_repair_completion(vdo_forget(repair));
|
|
vdo_continue_completion(parent, result);
|
|
}
|
|
|
|
/**
|
|
* abort_on_error() - Abort a repair if there is an error.
|
|
* @result: The result to check.
|
|
* @repair: The repair completion.
|
|
*
|
|
* Return: true if the result was an error.
|
|
*/
|
|
static bool __must_check abort_on_error(int result, struct repair_completion *repair)
|
|
{
|
|
if (result == VDO_SUCCESS)
|
|
return false;
|
|
|
|
vdo_fail_completion(&repair->completion, result);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* drain_slab_depot() - Flush out all dirty refcount blocks now that they have been rebuilt or
* recovered.
* @completion: The repair completion.
|
|
*/
|
|
static void drain_slab_depot(struct vdo_completion *completion)
|
|
{
|
|
struct vdo *vdo = completion->vdo;
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
const struct admin_state_code *operation;
|
|
|
|
vdo_assert_on_admin_thread(vdo, __func__);
|
|
|
|
prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
|
|
if (vdo_state_requires_read_only_rebuild(vdo->load_state)) {
|
|
vdo_log_info("Saving rebuilt state");
|
|
operation = VDO_ADMIN_STATE_REBUILDING;
|
|
} else {
|
|
vdo_log_info("Replayed %zu journal entries into slab journals",
|
|
repair->entries_added_to_slab_journals);
|
|
operation = VDO_ADMIN_STATE_RECOVERING;
|
|
}
|
|
|
|
vdo_drain_slab_depot(vdo->depot, operation, completion);
|
|
}
|
|
|
|
/**
|
|
* flush_block_map_updates() - Flush the block map now that all the reference counts are rebuilt.
|
|
* @completion: The repair completion.
|
|
*
|
|
* This callback is registered in fetch_page().
|
|
*/
|
|
static void flush_block_map_updates(struct vdo_completion *completion)
|
|
{
|
|
vdo_assert_on_admin_thread(completion->vdo, __func__);
|
|
|
|
vdo_log_info("Flushing block map changes");
|
|
prepare_repair_completion(as_repair_completion(completion), drain_slab_depot,
|
|
VDO_ZONE_TYPE_ADMIN);
|
|
vdo_drain_block_map(completion->vdo->block_map, VDO_ADMIN_STATE_RECOVERING,
|
|
completion);
|
|
}
|
|
|
|
static bool fetch_page(struct repair_completion *repair,
|
|
struct vdo_completion *completion);
|
|
|
|
/**
|
|
* handle_page_load_error() - Handle an error loading a page.
|
|
* @completion: The vdo_page_completion.
|
|
*/
|
|
static void handle_page_load_error(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = completion->parent;
|
|
|
|
repair->outstanding--;
|
|
vdo_set_completion_result(&repair->completion, completion->result);
|
|
vdo_release_page_completion(completion);
|
|
fetch_page(repair, completion);
|
|
}
|
|
|
|
/**
|
|
* unmap_entry() - Unmap an invalid entry and indicate that its page must be written out.
|
|
* @page: The page containing the entries
|
|
* @completion: The page_completion for writing the page
|
|
* @slot: The slot to unmap
|
|
*/
|
|
static void unmap_entry(struct block_map_page *page, struct vdo_completion *completion,
|
|
slot_number_t slot)
|
|
{
|
|
page->entries[slot] = UNMAPPED_BLOCK_MAP_ENTRY;
|
|
vdo_request_page_write(completion);
|
|
}
|
|
|
|
/**
|
|
* remove_out_of_bounds_entries() - Unmap entries which are outside the logical space.
|
|
* @page: The page containing the entries
|
|
* @completion: The page_completion for writing the page
|
|
* @start: The first slot to check
|
|
*/
|
|
static void remove_out_of_bounds_entries(struct block_map_page *page,
|
|
struct vdo_completion *completion,
|
|
slot_number_t start)
|
|
{
|
|
slot_number_t slot;
|
|
|
|
for (slot = start; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) {
|
|
struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
|
|
|
|
if (vdo_is_mapped_location(&mapping))
|
|
unmap_entry(page, completion, slot);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* process_slot() - Update the reference counts for a single entry.
|
|
* @page: The page containing the entries
|
|
* @completion: The page_completion for writing the page
|
|
* @slot: The slot to check
|
|
*
|
|
* Return: true if the entry was a valid mapping
|
|
*/
|
|
static bool process_slot(struct block_map_page *page, struct vdo_completion *completion,
|
|
slot_number_t slot)
|
|
{
|
|
struct slab_depot *depot = completion->vdo->depot;
|
|
int result;
|
|
struct data_location mapping = vdo_unpack_block_map_entry(&page->entries[slot]);
|
|
|
|
if (!vdo_is_valid_location(&mapping)) {
|
|
/* This entry is invalid, so remove it from the page. */
|
|
unmap_entry(page, completion, slot);
|
|
return false;
|
|
}
|
|
|
|
if (!vdo_is_mapped_location(&mapping))
|
|
return false;
|
|
|
|
|
|
if (mapping.pbn == VDO_ZERO_BLOCK)
|
|
return true;
|
|
|
|
if (!vdo_is_physical_data_block(depot, mapping.pbn)) {
|
|
/*
|
|
* This is a nonsense mapping. Remove it from the map so we're at least consistent
|
|
* and mark the page dirty.
|
|
*/
|
|
unmap_entry(page, completion, slot);
|
|
return false;
|
|
}
|
|
|
|
result = vdo_adjust_reference_count_for_rebuild(depot, mapping.pbn,
|
|
VDO_JOURNAL_DATA_REMAPPING);
|
|
if (result == VDO_SUCCESS)
|
|
return true;
|
|
|
|
vdo_log_error_strerror(result,
|
|
"Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu",
|
|
(unsigned long long) vdo_get_block_map_page_pbn(page),
|
|
slot, (unsigned long long) mapping.pbn);
|
|
unmap_entry(page, completion, slot);
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* rebuild_reference_counts_from_page() - Rebuild reference counts from a block map page.
|
|
* @repair: The repair completion.
|
|
* @completion: The page completion holding the page.
|
|
*/
|
|
static void rebuild_reference_counts_from_page(struct repair_completion *repair,
|
|
struct vdo_completion *completion)
|
|
{
|
|
slot_number_t slot, last_slot;
|
|
struct block_map_page *page;
|
|
int result;
|
|
|
|
result = vdo_get_cached_page(completion, &page);
|
|
if (result != VDO_SUCCESS) {
|
|
vdo_set_completion_result(&repair->completion, result);
|
|
return;
|
|
}
|
|
|
|
if (!page->header.initialized)
|
|
return;
|
|
|
|
/* Remove any bogus entries which exist beyond the end of the logical space. */
|
|
if (vdo_get_block_map_page_pbn(page) == repair->last_slot.pbn) {
|
|
last_slot = repair->last_slot.slot;
|
|
remove_out_of_bounds_entries(page, completion, last_slot);
|
|
} else {
|
|
last_slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
|
|
}
|
|
|
|
/* Inform the slab depot of all entries on this page. */
|
|
for (slot = 0; slot < last_slot; slot++) {
|
|
if (process_slot(page, completion, slot))
|
|
repair->logical_blocks_used++;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* page_loaded() - Process a page which has just been loaded.
|
|
* @completion: The vdo_page_completion for the fetched page.
|
|
*
|
|
* This callback is registered by fetch_page().
|
|
*/
|
|
static void page_loaded(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = completion->parent;
|
|
|
|
repair->outstanding--;
|
|
rebuild_reference_counts_from_page(repair, completion);
|
|
vdo_release_page_completion(completion);
|
|
|
|
/* Advance progress to the next page, and fetch the next page we haven't yet requested. */
|
|
fetch_page(repair, completion);
|
|
}
|
|
|
|
static physical_block_number_t get_pbn_to_fetch(struct repair_completion *repair,
|
|
struct block_map *block_map)
|
|
{
|
|
physical_block_number_t pbn = VDO_ZERO_BLOCK;
|
|
|
|
if (repair->completion.result != VDO_SUCCESS)
|
|
return VDO_ZERO_BLOCK;
|
|
|
|
while ((pbn == VDO_ZERO_BLOCK) && (repair->page_to_fetch < repair->leaf_pages))
|
|
pbn = vdo_find_block_map_page_pbn(block_map, repair->page_to_fetch++);
|
|
|
|
if (vdo_is_physical_data_block(repair->completion.vdo->depot, pbn))
|
|
return pbn;
|
|
|
|
vdo_set_completion_result(&repair->completion, VDO_BAD_MAPPING);
|
|
return VDO_ZERO_BLOCK;
|
|
}
|
|
|
|
/**
|
|
* fetch_page() - Fetch a page from the block map.
|
|
* @repair: The repair_completion.
|
|
* @completion: The page completion to use.
|
|
*
|
|
* Return: true if the rebuild is complete.
|
|
*/
|
|
static bool fetch_page(struct repair_completion *repair,
|
|
struct vdo_completion *completion)
|
|
{
|
|
struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
|
|
struct block_map *block_map = repair->completion.vdo->block_map;
|
|
physical_block_number_t pbn = get_pbn_to_fetch(repair, block_map);
|
|
|
|
if (pbn != VDO_ZERO_BLOCK) {
|
|
repair->outstanding++;
|
|
/*
|
|
* We must set the requeue flag here to ensure that we don't blow the stack if all
|
|
* the requested pages are already in the cache or get load errors.
|
|
*/
|
|
vdo_get_page(page_completion, &block_map->zones[0], pbn, true, repair,
|
|
page_loaded, handle_page_load_error, true);
|
|
}
|
|
|
|
if (repair->outstanding > 0)
|
|
return false;
|
|
|
|
launch_repair_completion(repair, flush_block_map_updates, VDO_ZONE_TYPE_ADMIN);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* rebuild_from_leaves() - Rebuild reference counts from the leaf block map pages.
|
|
* @completion: The repair completion.
|
|
*
|
|
* Rebuilds reference counts from the leaf block map pages now that reference counts have been
|
|
* rebuilt from the interior tree pages (which have been loaded in the process). This callback is
|
|
* registered in rebuild_reference_counts().
|
|
*/
|
|
static void rebuild_from_leaves(struct vdo_completion *completion)
|
|
{
|
|
page_count_t i;
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
struct block_map *map = completion->vdo->block_map;
|
|
|
|
repair->logical_blocks_used = 0;
|
|
|
|
/*
|
|
* The PBN calculation doesn't work until the tree pages have been loaded, so we can't set
|
|
* this value at the start of repair.
|
|
*/
|
|
repair->leaf_pages = vdo_compute_block_map_page_count(map->entry_count);
|
|
repair->last_slot = (struct block_map_slot) {
|
|
.slot = map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE,
|
|
.pbn = vdo_find_block_map_page_pbn(map, repair->leaf_pages - 1),
|
|
};
|
|
if (repair->last_slot.slot == 0)
|
|
repair->last_slot.slot = VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
|
|
|
|
for (i = 0; i < repair->page_count; i++) {
|
|
if (fetch_page(repair, &repair->page_completions[i].completion)) {
|
|
/*
|
|
* The rebuild has already moved on, so it isn't safe nor is there a need
|
|
* to launch any more fetches.
|
|
*/
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* process_entry() - Process a single entry from the block map tree.
|
|
* @pbn: A pbn which holds a block map tree page.
|
|
* @completion: The parent completion of the traversal.
|
|
*
|
|
* Implements vdo_entry_callback_fn.
|
|
*
|
|
* Return: VDO_SUCCESS or an error.
|
|
*/
|
|
static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
struct slab_depot *depot = completion->vdo->depot;
|
|
int result;
|
|
|
|
if ((pbn == VDO_ZERO_BLOCK) || !vdo_is_physical_data_block(depot, pbn)) {
|
|
return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
|
|
"PBN %llu out of range",
|
|
(unsigned long long) pbn);
|
|
}
|
|
|
|
result = vdo_adjust_reference_count_for_rebuild(depot, pbn,
|
|
VDO_JOURNAL_BLOCK_MAP_REMAPPING);
|
|
if (result != VDO_SUCCESS) {
|
|
return vdo_log_error_strerror(result,
|
|
"Could not adjust reference count for block map tree PBN %llu",
|
|
(unsigned long long) pbn);
|
|
}
|
|
|
|
repair->block_map_data_blocks++;
|
|
return VDO_SUCCESS;
|
|
}
|
|
|
|
static void rebuild_reference_counts(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
struct vdo *vdo = completion->vdo;
|
|
struct vdo_page_cache *cache = &vdo->block_map->zones[0].page_cache;
|
|
|
|
/* We must allocate ref_counts before we can rebuild them. */
|
|
if (abort_on_error(vdo_allocate_reference_counters(vdo->depot), repair))
|
|
return;
|
|
|
|
/*
|
|
* Completion chaining from page cache hits can lead to stack overflow during the rebuild,
|
|
* so clear out the cache before this rebuild phase.
|
|
*/
|
|
if (abort_on_error(vdo_invalidate_page_cache(cache), repair))
|
|
return;
|
|
|
|
prepare_repair_completion(repair, rebuild_from_leaves, VDO_ZONE_TYPE_LOGICAL);
|
|
vdo_traverse_forest(vdo->block_map, process_entry, completion);
|
|
}
|
|
|
|
/**
|
|
* increment_recovery_point() - Move the given recovery point forward by one entry.
|
|
*/
|
|
static void increment_recovery_point(struct recovery_point *point)
|
|
{
|
|
if (++point->entry_count < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
|
|
return;
|
|
|
|
point->entry_count = 0;
|
|
if (point->sector_count < (VDO_SECTORS_PER_BLOCK - 1)) {
|
|
point->sector_count++;
|
|
return;
|
|
}
|
|
|
|
point->sequence_number++;
|
|
point->sector_count = 1;
|
|
}
|
|
|
|
/**
|
|
* advance_points() - Advance the current recovery and journal points.
|
|
* @repair: The repair_completion whose points are to be advanced.
|
|
* @entries_per_block: The number of entries in a recovery journal block.
|
|
*/
|
|
static void advance_points(struct repair_completion *repair,
|
|
journal_entry_count_t entries_per_block)
|
|
{
|
|
if (!repair->next_recovery_point.increment_applied) {
|
|
repair->next_recovery_point.increment_applied = true;
|
|
return;
|
|
}
|
|
|
|
increment_recovery_point(&repair->next_recovery_point);
|
|
vdo_advance_journal_point(&repair->next_journal_point, entries_per_block);
|
|
repair->next_recovery_point.increment_applied = false;
|
|
}
|
|
|
|
/**
|
|
* before_recovery_point() - Check whether the first point precedes the second point.
|
|
* @first: The first recovery point.
|
|
* @second: The second recovery point.
|
|
*
|
|
* Return: true if the first point precedes the second point.
|
|
*/
|
|
static bool __must_check before_recovery_point(const struct recovery_point *first,
|
|
const struct recovery_point *second)
|
|
{
|
|
if (first->sequence_number < second->sequence_number)
|
|
return true;
|
|
|
|
if (first->sequence_number > second->sequence_number)
|
|
return false;
|
|
|
|
if (first->sector_count < second->sector_count)
|
|
return true;
|
|
|
|
return ((first->sector_count == second->sector_count) &&
|
|
(first->entry_count < second->entry_count));
|
|
}
|
|
|
|
static struct packed_journal_sector * __must_check get_sector(struct recovery_journal *journal,
|
|
char *journal_data,
|
|
sequence_number_t sequence,
|
|
u8 sector_number)
|
|
{
|
|
off_t offset;
|
|
|
|
offset = ((vdo_get_recovery_journal_block_number(journal, sequence) * VDO_BLOCK_SIZE) +
|
|
(VDO_SECTOR_SIZE * sector_number));
|
|
return (struct packed_journal_sector *) (journal_data + offset);
|
|
}
|
|
|
|
/**
|
|
* get_entry() - Unpack the recovery journal entry associated with the given recovery point.
|
|
* @repair: The repair completion.
|
|
* @point: The recovery point.
|
|
*
|
|
* Return: The unpacked contents of the matching recovery journal entry.
|
|
*/
|
|
static struct recovery_journal_entry get_entry(const struct repair_completion *repair,
|
|
const struct recovery_point *point)
|
|
{
|
|
struct packed_journal_sector *sector;
|
|
|
|
sector = get_sector(repair->completion.vdo->recovery_journal,
|
|
repair->journal_data, point->sequence_number,
|
|
point->sector_count);
|
|
return vdo_unpack_recovery_journal_entry(&sector->entries[point->entry_count]);
|
|
}
|
|
|
|
/**
|
|
* validate_recovery_journal_entry() - Validate a recovery journal entry.
|
|
* @vdo: The vdo.
|
|
* @entry: The entry to validate.
|
|
*
|
|
* Return: VDO_SUCCESS or an error.
|
|
*/
|
|
static int validate_recovery_journal_entry(const struct vdo *vdo,
|
|
const struct recovery_journal_entry *entry)
|
|
{
|
|
if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) ||
|
|
(entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) ||
|
|
!vdo_is_valid_location(&entry->mapping) ||
|
|
!vdo_is_valid_location(&entry->unmapping) ||
|
|
!vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn) ||
|
|
!vdo_is_physical_data_block(vdo->depot, entry->unmapping.pbn)) {
|
|
return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
|
|
"Invalid entry: %s (%llu, %u) from %llu to %llu is not within bounds",
|
|
vdo_get_journal_operation_name(entry->operation),
|
|
(unsigned long long) entry->slot.pbn,
|
|
entry->slot.slot,
|
|
(unsigned long long) entry->unmapping.pbn,
|
|
(unsigned long long) entry->mapping.pbn);
|
|
}
|
|
|
|
if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) &&
|
|
(vdo_is_state_compressed(entry->mapping.state) ||
|
|
(entry->mapping.pbn == VDO_ZERO_BLOCK) ||
|
|
(entry->unmapping.state != VDO_MAPPING_STATE_UNMAPPED) ||
|
|
(entry->unmapping.pbn != VDO_ZERO_BLOCK))) {
|
|
return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
|
|
"Invalid entry: %s (%llu, %u) from %llu to %llu is not a valid tree mapping",
|
|
vdo_get_journal_operation_name(entry->operation),
|
|
(unsigned long long) entry->slot.pbn,
|
|
entry->slot.slot,
|
|
(unsigned long long) entry->unmapping.pbn,
|
|
(unsigned long long) entry->mapping.pbn);
|
|
}
|
|
|
|
return VDO_SUCCESS;
|
|
}
|
|
|
|
/**
|
|
* add_slab_journal_entries() - Replay recovery journal entries into the slab journals of the
|
|
* allocator currently being recovered.
|
|
* @completion: The allocator completion.
|
|
*
|
|
* Waits for slab journal tailblock space when necessary. This method is its own callback.
|
|
*/
|
|
static void add_slab_journal_entries(struct vdo_completion *completion)
|
|
{
|
|
struct recovery_point *recovery_point;
|
|
struct repair_completion *repair = completion->parent;
|
|
struct vdo *vdo = completion->vdo;
|
|
struct recovery_journal *journal = vdo->recovery_journal;
|
|
struct block_allocator *allocator = vdo_as_block_allocator(completion);
|
|
|
|
/* Get ready in case we need to enqueue again. */
|
|
vdo_prepare_completion(completion, add_slab_journal_entries,
|
|
vdo_notify_slab_journals_are_recovered,
|
|
completion->callback_thread_id, repair);
|
|
for (recovery_point = &repair->next_recovery_point;
|
|
before_recovery_point(recovery_point, &repair->tail_recovery_point);
|
|
advance_points(repair, journal->entries_per_block)) {
|
|
int result;
|
|
physical_block_number_t pbn;
|
|
struct vdo_slab *slab;
|
|
struct recovery_journal_entry entry = get_entry(repair, recovery_point);
|
|
bool increment = !repair->next_recovery_point.increment_applied;
|
|
|
|
if (increment) {
|
|
result = validate_recovery_journal_entry(vdo, &entry);
|
|
if (result != VDO_SUCCESS) {
|
|
vdo_enter_read_only_mode(vdo, result);
|
|
vdo_fail_completion(completion, result);
|
|
return;
|
|
}
|
|
|
|
pbn = entry.mapping.pbn;
|
|
} else {
|
|
pbn = entry.unmapping.pbn;
|
|
}
|
|
|
|
if (pbn == VDO_ZERO_BLOCK)
|
|
continue;
|
|
|
|
slab = vdo_get_slab(vdo->depot, pbn);
|
|
if (slab->allocator != allocator)
|
|
continue;
|
|
|
|
if (!vdo_attempt_replay_into_slab(slab, pbn, entry.operation, increment,
|
|
&repair->next_journal_point,
|
|
completion))
|
|
return;
|
|
|
|
repair->entries_added_to_slab_journals++;
|
|
}
|
|
|
|
vdo_notify_slab_journals_are_recovered(completion);
|
|
}
|
|
|
|
/**
|
|
* vdo_replay_into_slab_journals() - Replay recovery journal entries in the slab journals of slabs
|
|
* owned by a given block_allocator.
|
|
* @allocator: The allocator whose slab journals are to be recovered.
|
|
* @context: The slab depot load context supplied by a recovery when it loads the depot.
|
|
*/
|
|
void vdo_replay_into_slab_journals(struct block_allocator *allocator, void *context)
|
|
{
|
|
struct vdo_completion *completion = &allocator->completion;
|
|
struct repair_completion *repair = context;
|
|
struct vdo *vdo = completion->vdo;
|
|
|
|
vdo_assert_on_physical_zone_thread(vdo, allocator->zone_number, __func__);
|
|
if (repair->entry_count == 0) {
|
|
/* there's nothing to replay */
|
|
repair->logical_blocks_used = vdo->recovery_journal->logical_blocks_used;
|
|
repair->block_map_data_blocks = vdo->recovery_journal->block_map_data_blocks;
|
|
vdo_notify_slab_journals_are_recovered(completion);
|
|
return;
|
|
}
|
|
|
|
repair->next_recovery_point = (struct recovery_point) {
|
|
.sequence_number = repair->slab_journal_head,
|
|
.sector_count = 1,
|
|
.entry_count = 0,
|
|
};
|
|
|
|
repair->next_journal_point = (struct journal_point) {
|
|
.sequence_number = repair->slab_journal_head,
|
|
.entry_count = 0,
|
|
};
|
|
|
|
vdo_log_info("Replaying entries into slab journals for zone %u",
|
|
allocator->zone_number);
|
|
completion->parent = repair;
|
|
add_slab_journal_entries(completion);
|
|
}
|
|
|
|
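/**
* load_slab_depot() - Load the slab depot in the mode appropriate for recovery or read-only
* rebuild, and prepare the next repair phase.
* @completion: The repair completion.
*/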
static void load_slab_depot(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
const struct admin_state_code *operation;
|
|
|
|
vdo_assert_on_admin_thread(completion->vdo, __func__);
|
|
|
|
if (vdo_state_requires_read_only_rebuild(completion->vdo->load_state)) {
|
|
prepare_repair_completion(repair, rebuild_reference_counts,
|
|
VDO_ZONE_TYPE_LOGICAL);
|
|
operation = VDO_ADMIN_STATE_LOADING_FOR_REBUILD;
|
|
} else {
|
|
prepare_repair_completion(repair, drain_slab_depot, VDO_ZONE_TYPE_ADMIN);
|
|
operation = VDO_ADMIN_STATE_LOADING_FOR_RECOVERY;
|
|
}
|
|
|
|
vdo_load_slab_depot(completion->vdo->depot, operation, completion, repair);
|
|
}
|
|
|
|
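/**
* flush_block_map() - Flush the block map now that all journal entries have been applied to it.
* @completion: The repair completion.
*
* This callback is registered in finish_if_done().
*/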
static void flush_block_map(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
const struct admin_state_code *operation;
|
|
|
|
vdo_assert_on_admin_thread(completion->vdo, __func__);
|
|
|
|
vdo_log_info("Flushing block map changes");
|
|
prepare_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
|
|
operation = (vdo_state_requires_read_only_rebuild(completion->vdo->load_state) ?
|
|
VDO_ADMIN_STATE_REBUILDING :
|
|
VDO_ADMIN_STATE_RECOVERING);
|
|
vdo_drain_block_map(completion->vdo->block_map, operation, completion);
|
|
}
|
|
|
|
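/**
* finish_if_done() - Check whether the block map replay is done, and if so, clean up and launch
* the next phase or the error path.
* @repair: The repair completion.
*
* Return: true if the repair completion has been launched and no more replay work remains.
*/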
static bool finish_if_done(struct repair_completion *repair)
|
|
{
|
|
/* Pages are still being launched or there is still work to do */
|
|
if (repair->launching || (repair->outstanding > 0))
|
|
return false;
|
|
|
|
if (repair->completion.result != VDO_SUCCESS) {
|
|
page_count_t i;
|
|
|
|
for (i = 0; i < repair->page_count; i++) {
|
|
struct vdo_page_completion *page_completion =
|
|
&repair->page_completions[i];
|
|
|
|
if (page_completion->ready)
|
|
vdo_release_page_completion(&page_completion->completion);
|
|
}
|
|
|
|
vdo_launch_completion(&repair->completion);
|
|
return true;
|
|
}
|
|
|
|
if (repair->current_entry >= repair->entries)
|
|
return false;
|
|
|
|
launch_repair_completion(repair, flush_block_map, VDO_ZONE_TYPE_ADMIN);
|
|
return true;
|
|
}
|
|
|
|
static void abort_block_map_recovery(struct repair_completion *repair, int result)
|
|
{
|
|
vdo_set_completion_result(&repair->completion, result);
|
|
finish_if_done(repair);
|
|
}
|
|
|
|
/**
|
|
* find_entry_starting_next_page() - Find the first journal entry after a given entry which is not
|
|
* on the same block map page.
|
|
* @repair: The repair completion.
* @current_entry: The entry to search from.
|
|
* @needs_sort: Whether sorting is needed to proceed.
|
|
*
|
|
* Return: Pointer to the first later journal entry on a different block map page, or a pointer to
|
|
* just before the journal entries if no subsequent entry is on a different block map page.
|
|
*/
|
|
static struct numbered_block_mapping *
|
|
find_entry_starting_next_page(struct repair_completion *repair,
|
|
struct numbered_block_mapping *current_entry, bool needs_sort)
|
|
{
|
|
size_t current_page;
|
|
|
|
/* If current_entry is invalid, return immediately. */
|
|
if (current_entry < repair->entries)
|
|
return current_entry;
|
|
|
|
current_page = current_entry->block_map_slot.pbn;
|
|
|
|
/* Decrement current_entry until it's out of bounds or on a different page. */
|
|
while ((current_entry >= repair->entries) &&
|
|
(current_entry->block_map_slot.pbn == current_page)) {
|
|
if (needs_sort) {
|
|
struct numbered_block_mapping *just_sorted_entry =
|
|
sort_next_heap_element(repair);
|
|
VDO_ASSERT_LOG_ONLY(just_sorted_entry < current_entry,
|
|
"heap is returning elements in an unexpected order");
|
|
}
|
|
|
|
current_entry--;
|
|
}
|
|
|
|
return current_entry;
|
|
}
|
|
|
|
/*
|
|
* Apply a range of journal entries [starting_entry, ending_entry) to a block map page.
|
|
*/
|
|
static void apply_journal_entries_to_page(struct block_map_page *page,
|
|
struct numbered_block_mapping *starting_entry,
|
|
struct numbered_block_mapping *ending_entry)
|
|
{
|
|
struct numbered_block_mapping *current_entry = starting_entry;
|
|
|
|
while (current_entry != ending_entry) {
|
|
page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry;
|
|
current_entry--;
|
|
}
|
|
}
|
|
|
|
static void recover_ready_pages(struct repair_completion *repair,
|
|
struct vdo_completion *completion);
|
|
|
|
static void block_map_page_loaded(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion->parent);
|
|
|
|
repair->outstanding--;
|
|
if (!repair->launching)
|
|
recover_ready_pages(repair, completion);
|
|
}
|
|
|
|
static void handle_block_map_page_load_error(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion->parent);
|
|
|
|
repair->outstanding--;
|
|
abort_block_map_recovery(repair, completion->result);
|
|
}
|
|
|
|
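/**
* fetch_block_map_page() - Fetch the next block map page for which there are journal entries to
* apply, if there is one.
* @repair: The repair completion.
* @completion: The page completion to use for the fetch.
*/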
static void fetch_block_map_page(struct repair_completion *repair,
|
|
struct vdo_completion *completion)
|
|
{
|
|
physical_block_number_t pbn;
|
|
|
|
if (repair->current_unfetched_entry < repair->entries)
|
|
/* Nothing left to fetch. */
|
|
return;
|
|
|
|
/* Fetch the next page we haven't yet requested. */
|
|
pbn = repair->current_unfetched_entry->block_map_slot.pbn;
|
|
repair->current_unfetched_entry =
|
|
find_entry_starting_next_page(repair, repair->current_unfetched_entry,
|
|
true);
|
|
repair->outstanding++;
|
|
vdo_get_page(((struct vdo_page_completion *) completion),
|
|
&repair->completion.vdo->block_map->zones[0], pbn, true,
|
|
&repair->completion, block_map_page_loaded,
|
|
handle_block_map_page_load_error, false);
|
|
}
|
|
|
|
static struct vdo_page_completion *get_next_page_completion(struct repair_completion *repair,
|
|
struct vdo_page_completion *completion)
|
|
{
|
|
completion++;
|
|
if (completion == (&repair->page_completions[repair->page_count]))
|
|
completion = &repair->page_completions[0];
|
|
return completion;
|
|
}
|
|
|
|
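/**
* recover_ready_pages() - Apply journal entries to each block map page as it becomes ready,
* fetching further pages as their completions are released.
* @repair: The repair completion.
* @completion: The page completion of the next page to process.
*/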
static void recover_ready_pages(struct repair_completion *repair,
|
|
struct vdo_completion *completion)
|
|
{
|
|
struct vdo_page_completion *page_completion = (struct vdo_page_completion *) completion;
|
|
|
|
if (finish_if_done(repair))
|
|
return;
|
|
|
|
if (repair->pbn != page_completion->pbn)
|
|
return;
|
|
|
|
while (page_completion->ready) {
|
|
struct numbered_block_mapping *start_of_next_page;
|
|
struct block_map_page *page;
|
|
int result;
|
|
|
|
result = vdo_get_cached_page(completion, &page);
|
|
if (result != VDO_SUCCESS) {
|
|
abort_block_map_recovery(repair, result);
|
|
return;
|
|
}
|
|
|
|
start_of_next_page =
|
|
find_entry_starting_next_page(repair, repair->current_entry,
|
|
false);
|
|
apply_journal_entries_to_page(page, repair->current_entry,
|
|
start_of_next_page);
|
|
repair->current_entry = start_of_next_page;
|
|
vdo_request_page_write(completion);
|
|
vdo_release_page_completion(completion);
|
|
|
|
if (finish_if_done(repair))
|
|
return;
|
|
|
|
repair->pbn = repair->current_entry->block_map_slot.pbn;
|
|
fetch_block_map_page(repair, completion);
|
|
page_completion = get_next_page_completion(repair, page_completion);
|
|
completion = &page_completion->completion;
|
|
}
|
|
}
|
|
|
|
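/**
* recover_block_map() - Replay the extracted journal entries into the block map.
* @completion: The repair completion.
*
* This callback is registered in finish_journal_load().
*/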
static void recover_block_map(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = as_repair_completion(completion);
|
|
struct vdo *vdo = completion->vdo;
|
|
struct numbered_block_mapping *first_sorted_entry;
|
|
page_count_t i;
|
|
|
|
vdo_assert_on_logical_zone_thread(vdo, 0, __func__);
|
|
|
|
/* Suppress block map errors. */
|
|
vdo->block_map->zones[0].page_cache.rebuilding =
|
|
vdo_state_requires_read_only_rebuild(vdo->load_state);
|
|
|
|
if (repair->block_map_entry_count == 0) {
|
|
vdo_log_info("Replaying 0 recovery entries into block map");
|
|
vdo_free(vdo_forget(repair->journal_data));
|
|
launch_repair_completion(repair, load_slab_depot, VDO_ZONE_TYPE_ADMIN);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Organize the journal entries into a binary heap so we can iterate over them in sorted
|
|
* order incrementally, avoiding an expensive sort call.
|
|
*/
|
|
repair->replay_heap = (struct min_heap) {
|
|
.data = repair->entries,
|
|
.nr = repair->block_map_entry_count,
|
|
.size = repair->block_map_entry_count,
|
|
};
|
|
min_heapify_all(&repair->replay_heap, &repair_min_heap);
|
|
|
|
vdo_log_info("Replaying %zu recovery entries into block map",
|
|
repair->block_map_entry_count);
|
|
|
|
repair->current_entry = &repair->entries[repair->block_map_entry_count - 1];
|
|
first_sorted_entry = sort_next_heap_element(repair);
|
|
VDO_ASSERT_LOG_ONLY(first_sorted_entry == repair->current_entry,
|
|
"heap is returning elements in an unexpected order");
|
|
|
|
/* Prevent any page from being processed until all pages have been launched. */
|
|
repair->launching = true;
|
|
repair->pbn = repair->current_entry->block_map_slot.pbn;
|
|
repair->current_unfetched_entry = repair->current_entry;
|
|
for (i = 0; i < repair->page_count; i++) {
|
|
if (repair->current_unfetched_entry < repair->entries)
|
|
break;
|
|
|
|
fetch_block_map_page(repair, &repair->page_completions[i].completion);
|
|
}
|
|
repair->launching = false;
|
|
|
|
/* Process any ready pages. */
|
|
recover_ready_pages(repair, &repair->page_completions[0].completion);
|
|
}
|
|
|
|
/**
|
|
* get_recovery_journal_block_header() - Get the block header for a block at a position in the
|
|
* journal data and unpack it.
|
|
* @journal: The recovery journal.
|
|
* @data: The recovery journal data.
|
|
* @sequence: The sequence number.
|
|
*
|
|
* Return: The unpacked header.
|
|
*/
|
|
static struct recovery_block_header __must_check
|
|
get_recovery_journal_block_header(struct recovery_journal *journal, char *data,
|
|
sequence_number_t sequence)
|
|
{
|
|
physical_block_number_t pbn =
|
|
vdo_get_recovery_journal_block_number(journal, sequence);
|
|
char *header = &data[pbn * VDO_BLOCK_SIZE];
|
|
|
|
return vdo_unpack_recovery_block_header((struct packed_journal_header *) header);
|
|
}
|
|
|
|
/**
|
|
* is_valid_recovery_journal_block() - Determine whether the given header describes a valid block
|
|
* for the given journal.
|
|
* @journal: The journal to use.
|
|
* @header: The unpacked block header to check.
|
|
* @old_ok: Whether an old format header is valid.
|
|
*
|
|
* A block is not valid if it is unformatted, or if it is older than the last successful recovery
|
|
* or reformat.
|
|
*
|
|
* Return: True if the header is valid.
|
|
*/
|
|
static bool __must_check is_valid_recovery_journal_block(const struct recovery_journal *journal,
|
|
const struct recovery_block_header *header,
|
|
bool old_ok)
|
|
{
|
|
if ((header->nonce != journal->nonce) ||
|
|
(header->recovery_count != journal->recovery_count))
|
|
return false;
|
|
|
|
if (header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL_2)
|
|
return (header->entry_count <= journal->entries_per_block);
|
|
|
|
return (old_ok &&
|
|
(header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) &&
|
|
(header->entry_count <= RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK));
|
|
}
|
|
|
|
/**
|
|
* is_exact_recovery_journal_block() - Determine whether the given header describes the exact block
|
|
* indicated.
|
|
* @journal: The journal to use.
|
|
* @header: The unpacked block header to check.
|
|
* @sequence: The expected sequence number.
|
|
* @type: The expected metadata type.
|
|
*
|
|
* Return: True if the block matches.
|
|
*/
|
|
static bool __must_check is_exact_recovery_journal_block(const struct recovery_journal *journal,
|
|
const struct recovery_block_header *header,
|
|
sequence_number_t sequence,
|
|
enum vdo_metadata_type type)
|
|
{
|
|
return ((header->metadata_type == type) &&
|
|
(header->sequence_number == sequence) &&
|
|
(is_valid_recovery_journal_block(journal, header, true)));
|
|
}
|
|
|
|
/**
|
|
* find_recovery_journal_head_and_tail() - Find the tail and head of the journal.
|
|
*
|
|
* Return: True if there were valid journal blocks.
|
|
*/
|
|
static bool find_recovery_journal_head_and_tail(struct repair_completion *repair)
|
|
{
|
|
struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
|
|
bool found_entries = false;
|
|
physical_block_number_t i;
|
|
|
|
/*
|
|
* Ensure that we don't replay old entries since we know the tail recorded in the super
|
|
* block must be a lower bound. Not doing so can result in extra data loss by setting the
|
|
* tail too early.
|
|
*/
|
|
repair->highest_tail = journal->tail;
|
|
for (i = 0; i < journal->size; i++) {
|
|
struct recovery_block_header header =
|
|
get_recovery_journal_block_header(journal, repair->journal_data, i);
|
|
|
|
if (!is_valid_recovery_journal_block(journal, &header, true)) {
|
|
/* This block is old or incorrectly formatted */
|
|
continue;
|
|
}
|
|
|
|
if (vdo_get_recovery_journal_block_number(journal, header.sequence_number) != i) {
|
|
/* This block is in the wrong location */
|
|
continue;
|
|
}
|
|
|
|
if (header.sequence_number >= repair->highest_tail) {
|
|
found_entries = true;
|
|
repair->highest_tail = header.sequence_number;
|
|
}
|
|
|
|
if (!found_entries)
|
|
continue;
|
|
|
|
if (header.block_map_head > repair->block_map_head)
|
|
repair->block_map_head = header.block_map_head;
|
|
|
|
if (header.slab_journal_head > repair->slab_journal_head)
|
|
repair->slab_journal_head = header.slab_journal_head;
|
|
}
|
|
|
|
return found_entries;
|
|
}
|
|
|
|
/**
|
|
* unpack_entry() - Unpack a recovery journal entry in either format.
|
|
* @vdo: The vdo.
|
|
* @packed: The entry to unpack.
|
|
* @format: The expected format of the entry.
|
|
* @entry: The unpacked entry.
|
|
*
|
|
* Return: true if the entry should be applied.
|
|
*/
|
|
static bool unpack_entry(struct vdo *vdo, char *packed, enum vdo_metadata_type format,
|
|
struct recovery_journal_entry *entry)
|
|
{
|
|
if (format == VDO_METADATA_RECOVERY_JOURNAL_2) {
|
|
struct packed_recovery_journal_entry *packed_entry =
|
|
(struct packed_recovery_journal_entry *) packed;
|
|
|
|
*entry = vdo_unpack_recovery_journal_entry(packed_entry);
|
|
} else {
|
|
physical_block_number_t low32, high4;
|
|
|
|
struct packed_recovery_journal_entry_1 *packed_entry =
|
|
(struct packed_recovery_journal_entry_1 *) packed;
|
|
|
|
if (packed_entry->operation == VDO_JOURNAL_DATA_INCREMENT)
|
|
entry->operation = VDO_JOURNAL_DATA_REMAPPING;
|
|
else if (packed_entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)
|
|
entry->operation = VDO_JOURNAL_BLOCK_MAP_REMAPPING;
|
|
else
|
|
return false;
|
|
|
|
low32 = __le32_to_cpu(packed_entry->pbn_low_word);
|
|
high4 = packed_entry->pbn_high_nibble;
|
|
entry->slot = (struct block_map_slot) {
|
|
.pbn = ((high4 << 32) | low32),
|
|
.slot = (packed_entry->slot_low | (packed_entry->slot_high << 6)),
|
|
};
|
|
entry->mapping = vdo_unpack_block_map_entry(&packed_entry->block_map_entry);
|
|
entry->unmapping = (struct data_location) {
|
|
.pbn = VDO_ZERO_BLOCK,
|
|
.state = VDO_MAPPING_STATE_UNMAPPED,
|
|
};
|
|
}
|
|
|
|
return (validate_recovery_journal_entry(vdo, entry) == VDO_SUCCESS);
|
|
}
|
|
|
|
/**
|
|
* append_sector_entries() - Append an array of recovery journal entries from a journal block
|
|
* sector to the array of numbered mappings in the repair completion,
|
|
* numbering each entry in the order they are appended.
|
|
* @repair: The repair completion.
|
|
* @entries: The entries in the sector.
|
|
* @format: The format of the sector.
|
|
* @entry_count: The number of entries to append.
|
|
*/
|
|
static void append_sector_entries(struct repair_completion *repair, char *entries,
|
|
enum vdo_metadata_type format,
|
|
journal_entry_count_t entry_count)
|
|
{
|
|
journal_entry_count_t i;
|
|
struct vdo *vdo = repair->completion.vdo;
|
|
off_t increment = ((format == VDO_METADATA_RECOVERY_JOURNAL_2)
|
|
? sizeof(struct packed_recovery_journal_entry)
|
|
: sizeof(struct packed_recovery_journal_entry_1));
|
|
|
|
for (i = 0; i < entry_count; i++, entries += increment) {
|
|
struct recovery_journal_entry entry;
|
|
|
|
if (!unpack_entry(vdo, entries, format, &entry))
|
|
/* When recovering from read-only mode, ignore damaged entries. */
|
|
continue;
|
|
|
|
repair->entries[repair->block_map_entry_count] =
|
|
(struct numbered_block_mapping) {
|
|
.block_map_slot = entry.slot,
|
|
.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
|
|
entry.mapping.state),
|
|
.number = repair->block_map_entry_count,
|
|
};
|
|
repair->block_map_entry_count++;
|
|
}
|
|
}
|
|
|
|
static journal_entry_count_t entries_per_sector(enum vdo_metadata_type format,
|
|
u8 sector_number)
|
|
{
|
|
if (format == VDO_METADATA_RECOVERY_JOURNAL_2)
|
|
return RECOVERY_JOURNAL_ENTRIES_PER_SECTOR;
|
|
|
|
return ((sector_number == (VDO_SECTORS_PER_BLOCK - 1))
|
|
? RECOVERY_JOURNAL_1_ENTRIES_IN_LAST_SECTOR
|
|
: RECOVERY_JOURNAL_1_ENTRIES_PER_SECTOR);
|
|
}
|
|
|
|
static void extract_entries_from_block(struct repair_completion *repair,
|
|
struct recovery_journal *journal,
|
|
sequence_number_t sequence,
|
|
enum vdo_metadata_type format,
|
|
journal_entry_count_t entries)
|
|
{
|
|
sector_count_t i;
|
|
struct recovery_block_header header =
|
|
get_recovery_journal_block_header(journal, repair->journal_data,
|
|
sequence);
|
|
|
|
if (!is_exact_recovery_journal_block(journal, &header, sequence, format)) {
|
|
/* This block is invalid, so skip it. */
|
|
return;
|
|
}
|
|
|
|
entries = min(entries, header.entry_count);
|
|
for (i = 1; i < VDO_SECTORS_PER_BLOCK; i++) {
|
|
struct packed_journal_sector *sector =
|
|
get_sector(journal, repair->journal_data, sequence, i);
|
|
journal_entry_count_t sector_entries =
|
|
min(entries, entries_per_sector(format, i));
|
|
|
|
if (vdo_is_valid_recovery_journal_sector(&header, sector, i)) {
|
|
/* Only extract as many as the block header calls for. */
|
|
append_sector_entries(repair, (char *) sector->entries, format,
|
|
min_t(journal_entry_count_t,
|
|
sector->entry_count,
|
|
sector_entries));
|
|
}
|
|
|
|
/*
|
|
* Even if the sector wasn't full, count it as full when counting up to the
|
|
* entry count the block header claims.
|
|
*/
|
|
entries -= sector_entries;
|
|
}
|
|
}
|
|
|
|
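/**
* parse_journal_for_rebuild() - Extract the block map entries from every valid journal block for
* a read-only rebuild.
* @repair: The repair completion.
*
* Return: VDO_SUCCESS or an error.
*/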
static int parse_journal_for_rebuild(struct repair_completion *repair)
|
|
{
|
|
int result;
|
|
sequence_number_t i;
|
|
block_count_t count;
|
|
enum vdo_metadata_type format;
|
|
struct vdo *vdo = repair->completion.vdo;
|
|
struct recovery_journal *journal = vdo->recovery_journal;
|
|
journal_entry_count_t entries_per_block = journal->entries_per_block;
|
|
|
|
format = get_recovery_journal_block_header(journal, repair->journal_data,
|
|
repair->highest_tail).metadata_type;
|
|
if (format == VDO_METADATA_RECOVERY_JOURNAL)
|
|
entries_per_block = RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK;
|
|
|
|
/*
|
|
* Allocate an array of numbered_block_mapping structures large enough to transcribe every
|
|
* packed_recovery_journal_entry from every valid journal block.
|
|
*/
|
|
count = ((repair->highest_tail - repair->block_map_head + 1) * entries_per_block);
|
|
result = vdo_allocate(count, struct numbered_block_mapping, __func__,
|
|
&repair->entries);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
for (i = repair->block_map_head; i <= repair->highest_tail; i++)
|
|
extract_entries_from_block(repair, journal, i, format, entries_per_block);
|
|
|
|
return VDO_SUCCESS;
|
|
}
|
|
|
|
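/**
* validate_heads() - Check that the block map and slab journal heads do not lie beyond the tail.
* @repair: The repair completion.
*
* Return: VDO_SUCCESS, or VDO_CORRUPT_JOURNAL if the tail is too early.
*/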
static int validate_heads(struct repair_completion *repair)
|
|
{
|
|
/* Both reap heads must be behind the tail. */
|
|
if ((repair->block_map_head <= repair->tail) &&
|
|
(repair->slab_journal_head <= repair->tail))
|
|
return VDO_SUCCESS;
|
|
|
|
|
|
return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
|
|
"Journal tail too early. block map head: %llu, slab journal head: %llu, tail: %llu",
|
|
(unsigned long long) repair->block_map_head,
|
|
(unsigned long long) repair->slab_journal_head,
|
|
(unsigned long long) repair->tail);
|
|
}
|
|
|
|
/**
|
|
* extract_new_mappings() - Find all valid new mappings to be applied to the block map.
|
|
*
|
|
* The mappings are extracted from the journal and stored in a sortable array so that all of the
|
|
* mappings to be applied to a given block map page can be done in a single page fetch.
|
|
*/
|
|
static int extract_new_mappings(struct repair_completion *repair)
|
|
{
|
|
int result;
|
|
struct vdo *vdo = repair->completion.vdo;
|
|
struct recovery_point recovery_point = {
|
|
.sequence_number = repair->block_map_head,
|
|
.sector_count = 1,
|
|
.entry_count = 0,
|
|
};
|
|
|
|
/*
|
|
* Allocate an array of numbered_block_mapping structs just large enough to transcribe
|
|
* every packed_recovery_journal_entry from every valid journal block.
|
|
*/
|
|
result = vdo_allocate(repair->entry_count, struct numbered_block_mapping,
|
|
__func__, &repair->entries);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
|
|
increment_recovery_point(&recovery_point)) {
|
|
struct recovery_journal_entry entry = get_entry(repair, &recovery_point);
|
|
|
|
result = validate_recovery_journal_entry(vdo, &entry);
|
|
if (result != VDO_SUCCESS) {
|
|
vdo_enter_read_only_mode(vdo, result);
|
|
return result;
|
|
}
|
|
|
|
repair->entries[repair->block_map_entry_count] =
|
|
(struct numbered_block_mapping) {
|
|
.block_map_slot = entry.slot,
|
|
.block_map_entry = vdo_pack_block_map_entry(entry.mapping.pbn,
|
|
entry.mapping.state),
|
|
.number = repair->block_map_entry_count,
|
|
};
|
|
repair->block_map_entry_count++;
|
|
}
|
|
|
|
result = VDO_ASSERT((repair->block_map_entry_count <= repair->entry_count),
|
|
"approximate entry count is an upper bound");
|
|
if (result != VDO_SUCCESS)
|
|
vdo_enter_read_only_mode(vdo, result);
|
|
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* compute_usages() - Compute the lbns in use and block map data blocks counts from the tail of
|
|
* the journal.
|
|
*/
|
|
static noinline int compute_usages(struct repair_completion *repair)
|
|
{
|
|
/*
|
|
* This function is declared noinline to avoid a spurious valgrind error regarding the
|
|
* following structure being uninitialized.
|
|
*/
|
|
struct recovery_point recovery_point = {
|
|
.sequence_number = repair->tail,
|
|
.sector_count = 1,
|
|
.entry_count = 0,
|
|
};
|
|
|
|
struct vdo *vdo = repair->completion.vdo;
|
|
struct recovery_journal *journal = vdo->recovery_journal;
|
|
struct recovery_block_header header =
|
|
get_recovery_journal_block_header(journal, repair->journal_data,
|
|
repair->tail);
|
|
|
|
repair->logical_blocks_used = header.logical_blocks_used;
|
|
repair->block_map_data_blocks = header.block_map_data_blocks;
|
|
|
|
for (; before_recovery_point(&recovery_point, &repair->tail_recovery_point);
|
|
increment_recovery_point(&recovery_point)) {
|
|
struct recovery_journal_entry entry = get_entry(repair, &recovery_point);
|
|
int result;
|
|
|
|
result = validate_recovery_journal_entry(vdo, &entry);
|
|
if (result != VDO_SUCCESS) {
|
|
vdo_enter_read_only_mode(vdo, result);
|
|
return result;
|
|
}
|
|
|
|
if (entry.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
|
|
repair->block_map_data_blocks++;
|
|
continue;
|
|
}
|
|
|
|
if (vdo_is_mapped_location(&entry.mapping))
|
|
repair->logical_blocks_used++;
|
|
|
|
if (vdo_is_mapped_location(&entry.unmapping))
|
|
repair->logical_blocks_used--;
|
|
}
|
|
|
|
return VDO_SUCCESS;
|
|
}
|
|
|
|
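/**
* parse_journal_for_recovery() - Find the end of the valid journal, then extract the new block
* map mappings and usage counts from it.
* @repair: The repair completion.
*
* Return: VDO_SUCCESS or an error.
*/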
static int parse_journal_for_recovery(struct repair_completion *repair)
|
|
{
|
|
int result;
|
|
sequence_number_t i, head;
|
|
bool found_entries = false;
|
|
struct recovery_journal *journal = repair->completion.vdo->recovery_journal;
|
|
|
|
head = min(repair->block_map_head, repair->slab_journal_head);
|
|
for (i = head; i <= repair->highest_tail; i++) {
|
|
struct recovery_block_header header;
|
|
journal_entry_count_t block_entries;
|
|
u8 j;
|
|
|
|
repair->tail = i;
|
|
repair->tail_recovery_point = (struct recovery_point) {
|
|
.sequence_number = i,
|
|
.sector_count = 0,
|
|
.entry_count = 0,
|
|
};
|
|
|
|
header = get_recovery_journal_block_header(journal, repair->journal_data, i);
|
|
if (header.metadata_type == VDO_METADATA_RECOVERY_JOURNAL) {
|
|
/* This is an old format block, so we need to upgrade */
|
|
vdo_log_error_strerror(VDO_UNSUPPORTED_VERSION,
|
|
"Recovery journal is in the old format, a read-only rebuild is required.");
|
|
vdo_enter_read_only_mode(repair->completion.vdo,
|
|
VDO_UNSUPPORTED_VERSION);
|
|
return VDO_UNSUPPORTED_VERSION;
|
|
}
|
|
|
|
if (!is_exact_recovery_journal_block(journal, &header, i,
|
|
VDO_METADATA_RECOVERY_JOURNAL_2)) {
|
|
/* A bad block header was found so this must be the end of the journal. */
|
|
break;
|
|
}
|
|
|
|
block_entries = header.entry_count;
|
|
|
|
/* Examine each sector in turn to determine the last valid sector. */
|
|
for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) {
|
|
struct packed_journal_sector *sector =
|
|
get_sector(journal, repair->journal_data, i, j);
|
|
journal_entry_count_t sector_entries =
|
|
min_t(journal_entry_count_t, sector->entry_count,
|
|
block_entries);
|
|
|
|
/* A bad sector means that this block was torn. */
|
|
if (!vdo_is_valid_recovery_journal_sector(&header, sector, j))
|
|
break;
|
|
|
|
if (sector_entries > 0) {
|
|
found_entries = true;
|
|
repair->tail_recovery_point.sector_count++;
|
|
repair->tail_recovery_point.entry_count = sector_entries;
|
|
block_entries -= sector_entries;
|
|
repair->entry_count += sector_entries;
|
|
}
|
|
|
|
/* If this sector is short, the later sectors can't matter. */
|
|
if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) ||
|
|
(block_entries == 0))
|
|
break;
|
|
}
|
|
|
|
/* If this block was not filled, or if it tore, no later block can matter. */
|
|
if ((header.entry_count != journal->entries_per_block) || (block_entries > 0))
|
|
break;
|
|
}
|
|
|
|
if (!found_entries)
|
|
return validate_heads(repair);
|
|
|
|
/* Set the tail to the last valid tail block, if there is one. */
|
|
if (repair->tail_recovery_point.sector_count == 0)
|
|
repair->tail--;
|
|
|
|
result = validate_heads(repair);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
vdo_log_info("Highest-numbered recovery journal block has sequence number %llu, and the highest-numbered usable block is %llu",
|
|
(unsigned long long) repair->highest_tail,
|
|
(unsigned long long) repair->tail);
|
|
|
|
result = extract_new_mappings(repair);
|
|
if (result != VDO_SUCCESS)
|
|
return result;
|
|
|
|
return compute_usages(repair);
|
|
}
|
|
|
|
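/**
* parse_journal() - Parse the recovery journal, for either recovery or read-only rebuild as
* required by the vdo's load state.
* @repair: The repair completion.
*
* Return: VDO_SUCCESS or an error.
*/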
static int parse_journal(struct repair_completion *repair)
|
|
{
|
|
if (!find_recovery_journal_head_and_tail(repair))
|
|
return VDO_SUCCESS;
|
|
|
|
return (vdo_state_requires_read_only_rebuild(repair->completion.vdo->load_state) ?
|
|
parse_journal_for_rebuild(repair) :
|
|
parse_journal_for_recovery(repair));
|
|
}
|
|
|
|
static void finish_journal_load(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = completion->parent;
|
|
|
|
if (++repair->vios_complete != repair->vio_count)
|
|
return;
|
|
|
|
vdo_log_info("Finished reading recovery journal");
|
|
uninitialize_vios(repair);
|
|
prepare_repair_completion(repair, recover_block_map, VDO_ZONE_TYPE_LOGICAL);
|
|
vdo_continue_completion(&repair->completion, parse_journal(repair));
|
|
}
|
|
|
|
static void handle_journal_load_error(struct vdo_completion *completion)
|
|
{
|
|
struct repair_completion *repair = completion->parent;
|
|
|
|
/* Preserve the error */
|
|
vdo_set_completion_result(&repair->completion, completion->result);
|
|
vio_record_metadata_io_error(as_vio(completion));
|
|
completion->callback(completion);
|
|
}
|
|
|
|
static void read_journal_endio(struct bio *bio)
|
|
{
|
|
struct vio *vio = bio->bi_private;
|
|
struct vdo *vdo = vio->completion.vdo;
|
|
|
|
continue_vio_after_io(vio, finish_journal_load, vdo->thread_config.admin_thread);
|
|
}
|
|
|
|
/**
|
|
* vdo_repair() - Load the recovery journal and then recover or rebuild a vdo.
|
|
* @parent: The completion to notify when the operation is complete
|
|
*/
|
|
void vdo_repair(struct vdo_completion *parent)
|
|
{
|
|
int result;
|
|
char *ptr;
|
|
struct repair_completion *repair;
|
|
struct vdo *vdo = parent->vdo;
|
|
struct recovery_journal *journal = vdo->recovery_journal;
|
|
physical_block_number_t pbn = journal->origin;
|
|
block_count_t remaining = journal->size;
|
|
block_count_t vio_count = DIV_ROUND_UP(remaining, MAX_BLOCKS_PER_VIO);
|
|
page_count_t page_count = min_t(page_count_t,
|
|
vdo->device_config->cache_size >> 1,
|
|
MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS);
|
|
|
|
vdo_assert_on_admin_thread(vdo, __func__);
|
|
|
|
if (vdo->load_state == VDO_FORCE_REBUILD) {
|
|
vdo_log_warning("Rebuilding reference counts to clear read-only mode");
|
|
vdo->states.vdo.read_only_recoveries++;
|
|
} else if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) {
|
|
vdo_log_warning("Rebuilding reference counts for upgrade");
|
|
} else {
|
|
vdo_log_warning("Device was dirty, rebuilding reference counts");
|
|
}
|
|
|
|
result = vdo_allocate_extended(struct repair_completion, page_count,
|
|
struct vdo_page_completion, __func__,
|
|
&repair);
|
|
if (result != VDO_SUCCESS) {
|
|
vdo_fail_completion(parent, result);
|
|
return;
|
|
}
|
|
|
|
vdo_initialize_completion(&repair->completion, vdo, VDO_REPAIR_COMPLETION);
|
|
repair->completion.error_handler = abort_repair;
|
|
repair->completion.parent = parent;
|
|
prepare_repair_completion(repair, finish_repair, VDO_ZONE_TYPE_ADMIN);
|
|
repair->page_count = page_count;
|
|
|
|
result = vdo_allocate(remaining * VDO_BLOCK_SIZE, char, __func__,
|
|
&repair->journal_data);
|
|
if (abort_on_error(result, repair))
|
|
return;
|
|
|
|
result = vdo_allocate(vio_count, struct vio, __func__, &repair->vios);
|
|
if (abort_on_error(result, repair))
|
|
return;
|
|
|
|
ptr = repair->journal_data;
|
|
for (repair->vio_count = 0; repair->vio_count < vio_count; repair->vio_count++) {
|
|
block_count_t blocks = min_t(block_count_t, remaining,
|
|
MAX_BLOCKS_PER_VIO);
|
|
|
|
result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
|
|
VIO_PRIORITY_METADATA,
|
|
repair, blocks, ptr,
|
|
&repair->vios[repair->vio_count]);
|
|
if (abort_on_error(result, repair))
|
|
return;
|
|
|
|
ptr += (blocks * VDO_BLOCK_SIZE);
|
|
remaining -= blocks;
|
|
}
|
|
|
|
for (vio_count = 0; vio_count < repair->vio_count;
|
|
vio_count++, pbn += MAX_BLOCKS_PER_VIO) {
|
|
vdo_submit_metadata_vio(&repair->vios[vio_count], pbn, read_journal_endio,
|
|
handle_journal_load_error, REQ_OP_READ);
|
|
}
|
|
}
|