1
0
mirror of git://sourceware.org/git/lvm2.git synced 2024-12-21 13:34:40 +03:00

Merge branch 'master' into 2018-04-30-vdo-support

This commit is contained in:
Joe Thornber 2018-05-10 12:34:04 +01:00
commit e649f71022
8 changed files with 302 additions and 32 deletions

189
doc/lvm-disk-reading.txt Normal file
View File

@ -0,0 +1,189 @@
LVM disk reading
Reading disks happens in two phases. The first is a discovery phase,
which determines what's on the disks. The second is a working phase,
which does a particular job for the command.
Phase 1: Discovery
------------------
Read all the disks on the system to find out:
- What are the LVM devices?
- What VGs exist on those devices?
This phase is called "label scan" (although it reads and scans everything,
not just the label.) It stores the information it discovers (what LVM
devices exist, and what VGs exist on them) in lvmcache. The devs/VGs info
in lvmcache is the starting point for phase two.
Phase 1 in outline:
For each device:
a. Read the first <N> KB of the device. (N is configurable.)
b. Look for the lvm label_header in the first four sectors,
if none exists, it's not an lvm device, so quit looking at it.
(By default, label_header is in the second sector.)
c. Look at the pv_header, which follows the label_header.
This tells us the location of VG metadata on the device.
There can be 0, 1 or 2 copies of VG metadata. The first
is always at the start of the device, the second (if used)
is at the end.
d. Look at the first mda_header (location came from pv_header
in the previous step). This is by default in sector 8,
4096 bytes from the start of the device. This tells us the
location of the actual VG metadata text.
e. Look at the first copy of the text VG metadata (location came
from mda_header in the previous step). This is by default
in sector 9, 4608 bytes from the start of the device.
The VG metadata is only partially analyzed to create a basic
summary of the VG.
f. Store an "info" entry in lvmcache for this device,
indicating that it is an lvm device, and store a "vginfo"
entry in lvmcache indicating the name of the VG seen
in the metadata in step e.
g. If the pv_header in step c shows a second mda_header
location at the end of the device, then read that as
in step d, and repeat steps e-f for it.
At the end of phase 1, lvmcache will have a list of devices
that belong to LVM, and a list of VG names that exist on
those devices. Each device (info struct) is associated
with the VG (vginfo struct) it is used in.
Phase 1 in code:
The most relevant functions are listed for each step in the outline.
lvmcache_label_scan()
label_scan()
. dev_cache_scan()
choose which devices on the system to look at
. for each dev in dev_cache: bcache prefetch/read
. _process_block() to process data from bcache
_find_lvm_header() checks if this is an lvm dev by looking at label_header
_text_read() via ops->read() looks at mda/pv/vg data to populate lvmcache
. _read_mda_header_and_metadata()
raw_read_mda_header()
. _read_mda_header_and_metadata()
read_metadata_location()
text_read_metadata_summary()
config_file_read_fd()
_read_vgsummary() via ops->read_vgsummary()
. _text_read(): lvmcache_add()
[adds this device to list of lvm devices]
_read_mda_header_and_metadata(): lvmcache_update_vgname_and_id()
[adds the VG name to list of VGs]
Phase 2: Work
-------------
This phase carries out the operation requested by the command that was
run.
Whereas the first phase is based on iterating through each device on the
system, this phase is based on iterating through each VG name. The list
of VG names comes from phase 1, which stored the list in lvmcache to be
used by phase 2.
Some commands may need to iterate through all VG names, while others may
need to iterate through just one or two.
This phase includes locking each VG as work is done on it, so that two
commands do not interfere with each other.
Phase 2 in outline:
For each VG name:
a. Lock the VG.
b. Repeat the phase 1 scan steps for each device in this VG.
The phase 1 information in lvmcache may have changed because no VG lock
was held during phase 1. So, repeat the phase 1 steps, but only for the
devices in this VG. N.B. for commands that are just reporting data,
we skip this step if the data from phase 1 was complete and consistent.
c. Get the list of on-disk metadata locations for this VG.
Phase 1 created this list in lvmcache to be used here. At this
point we copy it out of lvmcache. In the simple/common case,
this is a list of devices in the VG. But, some devices may
have 0 or 2 metadata locations instead of the default 1, so it
is not always equal to the list of devices. We want to read
every copy of the metadata for this VG.
d. For each metadata location on each device in the VG
(the list from the previous step):
1) Look at the mda_header. The location of the mda_header was saved
in the lvmcache info struct by phase 1 (where it came from the
pv_header.) The mda_header tells us where the text VG metadata is
located.
2) Look at the text VG metadata. The location came from mda_header
in the previous step. The VG metadata is fully analyzed and used
to create an in-memory 'struct volume_group'.
e. Compare the copies of VG metadata that were found in each location.
If some copies are older, choose the newest one to use, and update
any older copies.
f. Update details about the devices/VG in lvmcache.
g. Pass the 'vg' struct to the command-specific code to work with.
Phase 2 in code:
The most relevant functions are listed for each step in the outline.
For each VG name:
process_each_vg()
. vg_read()
lock_vol()
. vg_read()
lvmcache_label_rescan_vg() (if needed)
[insert phase 1 steps for scanning devs, but only devs in this vg]
. vg_read()
create_instance()
_text_create_text_instance()
_create_vg_text_instance()
lvmcache_fid_add_mdas_vg()
[Copies mda locations from info->mdas where it was saved
by phase 1, into fid->metadata_areas_in_use. This is
the key connection between phase 1 and phase 2.]
. dm_list_iterate_items(mda, &fid->metadata_areas_in_use)
. _vg_read_raw() via ops->vg_read()
raw_read_mda_header()
. _vg_read_raw()
text_read_metadata()
config_file_read_fd()
_read_vg() via ops->read_vg()
. return the 'vg' struct from vg_read() and use it to do
command-specific work

View File

@ -227,3 +227,46 @@ bool bcache_zero_bytes(struct bcache *cache, int fd, uint64_t start, size_t len)
}
//----------------------------------------------------------------
/*
 * Fill [offset, offset + len) within one cached block with the byte
 * value stored behind u->data.  The block is fetched with GF_DIRTY so
 * its existing contents are read in and preserved around the filled
 * span (read-modify-write).  Returns false if the block can't be got.
 */
static bool _set_partial(struct updater *u, int fd, block_address bb, uint64_t offset, size_t len)
{
	uint8_t fill = *((uint8_t *) u->data);
	struct block *blk;

	if (!bcache_get(u->cache, fd, bb, GF_DIRTY, &blk, NULL))
		return false;

	memset((unsigned char *) blk->data + offset, fill, len);
	bcache_put(blk);

	return true;
}
/*
 * Fill every block in [bb, be) entirely with the byte value behind
 * u->data.  GF_ZERO is used because the whole block is about to be
 * overwritten, so there is no need to read the old data from disk.
 * Returns false on the first block that can't be got.
 */
static bool _set_whole(struct updater *u, int fd, block_address bb, block_address be)
{
	uint8_t fill = *((uint8_t *) u->data);
	uint64_t block_bytes = bcache_block_sectors(u->cache) * 512;
	block_address i;
	struct block *blk;

	for (i = bb; i != be; i++) {
		if (!bcache_get(u->cache, fd, i, GF_ZERO, &blk, NULL))
			return false;
		memset((unsigned char *) blk->data, fill, block_bytes);
		bcache_put(blk);
	}

	return true;
}
bool bcache_set_bytes(struct bcache *cache, int fd, uint64_t start, size_t len, uint8_t val)
{
struct updater u;
u.cache = cache;
u.partial_fn = _set_partial;
u.whole_fn = _set_whole;
u.data = &val;
return _update_bytes(&u, fd, start, len);
}

View File

@ -162,6 +162,7 @@ void bcache_prefetch_bytes(struct bcache *cache, int fd, uint64_t start, size_t
bool bcache_read_bytes(struct bcache *cache, int fd, uint64_t start, size_t len, void *data);
bool bcache_write_bytes(struct bcache *cache, int fd, uint64_t start, size_t len, void *data);
bool bcache_zero_bytes(struct bcache *cache, int fd, uint64_t start, size_t len);
bool bcache_set_bytes(struct bcache *cache, int fd, uint64_t start, size_t len, uint8_t val);
//----------------------------------------------------------------

View File

@ -17,7 +17,7 @@ SKIP_WITH_LVMPOLLD=1
. lib/inittest
aux have_cache 1 3 0 || skip
aux have_raid 1 3 0 || skip
aux prepare_vg 5 80

View File

@ -21,7 +21,7 @@ SKIP_WITH_LVMPOLLD=1
test $(aux total_mem) -gt $((4096*1024)) || skip
which mkfs.ext4 || skip
aux have_raid 1 13 1 || skip
aux have_raid 1 13 2 || skip
mount_dir="mnt"

View File

@ -15,16 +15,13 @@ SKIP_WITH_LVMPOLLD=1
. lib/inittest
# FIXME - skipping until proper kernel is released
skip
# Test reshaping under io load
# FIXME: This test requires 3GB in /dev/shm!
test $(aux total_mem) -gt $((4096*1024)) || skip
which mkfs.ext4 || skip
aux have_raid 1 13 1 || skip
aux have_raid 1 13 2 || skip
mount_dir="mnt"

View File

@ -21,7 +21,7 @@ SKIP_WITH_LVMPOLLD=1
test $(aux total_mem) -gt $((4096*1024)) || skip
which mkfs.ext4 || skip
aux have_raid 1 13 1 || skip
aux have_raid 1 13 2 || skip
mount_dir="mnt"

View File

@ -102,11 +102,6 @@ static void _verify_bytes(struct block *b, uint64_t base,
T_ASSERT_EQUAL(((uint8_t *) b->data)[offset + i], _pattern_at(pat, base + offset + i));
}
/* Zero len bytes of the block's data, starting at the given offset. */
static void _zero_bytes(struct block *b, uint64_t offset, uint64_t len)
{
	uint8_t *base = (uint8_t *) b->data;

	memset(base + offset, 0, len);
}
/* Return the smaller of two 64 bit values. */
static uint64_t _min(uint64_t lhs, uint64_t rhs)
{
	return lhs < rhs ? lhs : rhs;
}
@ -145,7 +140,7 @@ static void _verify(struct fixture *f, uint64_t byte_b, uint64_t byte_e, uint8_t
}
}
static void _verify_zeroes(struct fixture *f, uint64_t byte_b, uint64_t byte_e)
static void _verify_set(struct fixture *f, uint64_t byte_b, uint64_t byte_e, uint8_t val)
{
int err;
unsigned i;
@ -160,7 +155,7 @@ static void _verify_zeroes(struct fixture *f, uint64_t byte_b, uint64_t byte_e)
blen = _min(T_BLOCK_SIZE - offset, len);
for (i = 0; i < blen; i++)
T_ASSERT(((uint8_t *) b->data)[offset + i] == 0);
T_ASSERT(((uint8_t *) b->data)[offset + i] == val);
offset = 0;
len -= blen;
@ -169,6 +164,11 @@ static void _verify_zeroes(struct fixture *f, uint64_t byte_b, uint64_t byte_e)
}
}
/* Check [byte_b, byte_e) holds zeroes; special case of _verify_set. */
static void _verify_zeroes(struct fixture *f, uint64_t byte_b, uint64_t byte_e)
{
	const uint8_t zero = 0;

	_verify_set(f, byte_b, byte_e, zero);
}
static void _do_write(struct fixture *f, uint64_t byte_b, uint64_t byte_e, uint8_t pat)
{
unsigned i;
@ -179,30 +179,18 @@ static void _do_write(struct fixture *f, uint64_t byte_b, uint64_t byte_e, uint8
for (i = 0; i < len; i++)
buffer[i] = _pattern_at(pat, byte_b + i);
T_ASSERT(bcache_write_bytes(f->cache, f->fd, byte_b, i, buffer));
T_ASSERT(bcache_write_bytes(f->cache, f->fd, byte_b, byte_e - byte_b, buffer));
free(buffer);
}
static void _do_zero(struct fixture *f, uint64_t byte_b, uint64_t byte_e)
{
int err;
struct block *b;
block_address bb = byte_b / T_BLOCK_SIZE;
block_address be = (byte_e + T_BLOCK_SIZE - 1) / T_BLOCK_SIZE;
uint64_t offset = byte_b % T_BLOCK_SIZE;
uint64_t blen, len = byte_e - byte_b;
T_ASSERT(bcache_zero_bytes(f->cache, f->fd, byte_b, byte_e - byte_b));
}
for (; bb != be; bb++) {
T_ASSERT(bcache_get(f->cache, f->fd, bb, GF_DIRTY, &b, &err));
blen = _min(T_BLOCK_SIZE - offset, len);
_zero_bytes(b, offset, blen);
offset = 0;
len -= blen;
bcache_put(b);
}
/* Fill [byte_b, byte_e) via bcache_set_bytes and assert it succeeds. */
static void _do_set(struct fixture *f, uint64_t byte_b, uint64_t byte_e, uint8_t val)
{
	uint64_t nbytes = byte_e - byte_b;

	T_ASSERT(bcache_set_bytes(f->cache, f->fd, byte_b, nbytes, val));
}
static void _reopen(struct fixture *f)
@ -318,6 +306,51 @@ static void _test_zero_many_boundaries(void *fixture)
//----------------------------------------------------------------
/*
 * Core of the set tests: check the range holds the initial pattern,
 * fill [b, e) with a random byte, reopen the file (dropping the cache),
 * then verify the filled range and confirm 128 bytes on either side
 * still hold the initial pattern.
 */
static void _set_cycle(struct fixture *f, uint64_t b, uint64_t e)
{
	uint8_t fill = random();
	uint64_t before = (b < 128) ? 0 : b - 128;
	uint64_t after = _min(e + 128, _max_byte());

	_verify(f, b, e, INIT_PATTERN);
	_do_set(f, b, e, fill);
	_reopen(f);
	_verify(f, before, b, INIT_PATTERN);
	_verify_set(f, b, e, fill);
	_verify(f, e, after, INIT_PATTERN);
}
/* Set exactly the first block of the file. */
static void _test_set_first_block(void *fixture)
{
	struct fixture *f = fixture;

	_set_cycle(f, byte(0, 0), byte(0, T_BLOCK_SIZE));
}
/* Set exactly the last block of the file. */
static void _test_set_last_block(void *fixture)
{
	uint64_t last = NR_BLOCKS - 1;

	_set_cycle(fixture, byte(last, 0), byte(last, T_BLOCK_SIZE));
}
/* Set a run of whole blocks (5 through 9), aligned at both ends. */
static void _test_set_several_whole_blocks(void *fixture)
{
	struct fixture *f = fixture;

	_set_cycle(f, byte(5, 0), byte(10, 0));
}
/* Set a sub-range that starts and ends inside the same block. */
static void _test_set_within_single_block(void *fixture)
{
	struct fixture *f = fixture;

	_set_cycle(f, byte(7, 3), byte(7, T_BLOCK_SIZE / 2));
}
/* Set a range that straddles a single block boundary. */
static void _test_set_cross_one_boundary(void *fixture)
{
	struct fixture *f = fixture;

	_set_cycle(f, byte(13, 43), byte(14, 43));
}
/* Set an unaligned range that crosses many block boundaries. */
static void _test_set_many_boundaries(void *fixture)
{
	struct fixture *f = fixture;

	_set_cycle(f, byte(13, 13), byte(23, 13));
}
//----------------------------------------------------------------
#define T(path, desc, fn) register_test(ts, "/base/device/bcache/utils/" path, desc, fn)
static struct test_suite *_tests(void)
@ -342,6 +375,13 @@ static struct test_suite *_tests(void)
T("zero-cross-one-boundary", "zero across one boundary", _test_zero_cross_one_boundary);
T("zero-many-boundaries", "zero many boundaries", _test_zero_many_boundaries);
T("set-first-block", "set the first block", _test_set_first_block);
T("set-last-block", "set the last block", _test_set_last_block);
T("set-several-blocks", "set several whole blocks", _test_set_several_whole_blocks);
T("set-within-single-block", "set within single block", _test_set_within_single_block);
T("set-cross-one-boundary", "set across one boundary", _test_set_cross_one_boundary);
T("set-many-boundaries", "set many boundaries", _test_set_many_boundaries);
return ts;
}