static-delta: find a similar filename using what is before '.' or '-'

Improve the heuristic to use only the part before the first '.' when
looking for a similar file in the current directory.

Recent versions of dracut generate reproducible initramfs files, but we
still fall back to a full file download if any minimal change causes a
different checksum and file name.

This change extends that case to deal better with similar files that
have a different suffix.

This is the difference when generating a static delta from
fedora-atomic/f24/x86_64/docker-host to fedora-atomic/f24/x86_64/testing/docker-host:

before the patch:

fallback for 111ec866aa7ce3688407fa4a1ae7c9fca93dcee0b851fc9434c59ff947830cc7 (47.0 MB)
fallback for c6a898265de22b02c89ea2f35d132628d0ee1c0a058052ed14fee5799c17904c (47.0 MB)
fallback for fbce656249ece77260887ed873e445561b9d43bcb28a32e759c0b1bab89e7137 (6.6 MB)
fallback for cfdb51457e47e0a0fe0bac38991a21279d2646ff2f019630c7b52a0cd3451397 (6.6 MB)
part 0 n:1972 compressed:11239809 uncompressed:33747412
part 1 n:1079 compressed:9683681 uncompressed:55641397
part 2 n:1507 compressed:15050265 uncompressed:44448838
part 3 n:101 compressed:1865881 uncompressed:31896086
part 4 n:278 compressed:2452585 uncompressed:52811323
part 5 n:18 compressed:67621 uncompressed:100220
uncompressed=218645276 compressed=40359842 loose=545102
rollsum=49 objects, 2117254 bytes
bsdiff=4067 objects

after the patch:

part 0 n:843 compressed:19844109 uncompressed:95443178
part 1 n:1223 compressed:11188609 uncompressed:33330401
part 2 n:990 compressed:15762905 uncompressed:61214132
part 3 n:1441 compressed:20614573 uncompressed:31534195
part 4 n:163 compressed:2734997 uncompressed:51356423
part 5 n:285 compressed:2480813 uncompressed:52902904
part 6 n:14 compressed:59125 uncompressed:75341
uncompressed=325856574 compressed=72685131 loose=533283
rollsum=51 objects, 57235332 bytes
bsdiff=4073 objects

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>

Closes: #554
Approved by: cgwalters
This commit is contained in:
Giuseppe Scrivano
2016-10-28 14:44:09 +02:00
committed by Atomic Bot
parent 0333260559
commit c4c8937b20

View File

@ -189,18 +189,30 @@ build_content_sizenames_filtered (OstreeRepo *repo,
static gboolean
string_array_nonempty_intersection (GPtrArray *a,
GPtrArray *b)
GPtrArray *b,
gboolean fuzzy)
{
guint i;
for (i = 0; i < a->len; i++)
{
guint j;
const char *a_str = a->pdata[i];
const char *a_dot = strchr (a_str, '.');
for (j = 0; j < b->len; j++)
{
const char *b_str = b->pdata[j];
if (strcmp (a_str, b_str) == 0)
return TRUE;
const char *b_dot = strchr (b_str, '.');
/* When doing fuzzy comparison, just compare the part before the '.' if it exists. */
if (fuzzy && a_dot && b_dot && b_dot - b_str && b_dot - b_str == a_dot - a_str)
{
if (strncmp (a_str, b_str, a_dot - a_str) == 0)
return TRUE;
}
else
{
if (strcmp (a_str, b_str) == 0)
return TRUE;
}
}
}
return FALSE;
@ -258,6 +270,8 @@ _ostree_delta_compute_similar_objects (OstreeRepo *repo,
upper = from_sizes->len;
for (i = 0; i < to_sizes->len; i++)
{
int fuzzy;
gboolean found = FALSE;
OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
const guint64 min_threshold = to_sizenames->size *
(1.0-similarity_percent_threshold/100.0);
@ -268,31 +282,41 @@ _ostree_delta_compute_similar_objects (OstreeRepo *repo,
if (to_sizenames->size == 0)
continue;
for (j = lower; j < upper; j++)
for (fuzzy = 0; fuzzy < 2 && !found; fuzzy++)
{
OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
/* Don't build candidates for the empty object */
if (from_sizenames->size == 0)
continue;
if (from_sizenames->size < min_threshold)
for (j = lower; j < upper; j++)
{
lower++;
continue;
OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
/* Don't build candidates for the empty object */
if (from_sizenames->size == 0)
{
continue;
}
if (from_sizenames->size < min_threshold)
{
lower++;
continue;
}
if (from_sizenames->size > max_threshold)
break;
if (!string_array_nonempty_intersection (from_sizenames->basenames,
to_sizenames->basenames,
fuzzy == 1))
{
continue;
}
/* Only one candidate right now */
g_hash_table_insert (ret_modified_regfile_content,
g_strdup (to_sizenames->checksum),
g_strdup (from_sizenames->checksum));
found = TRUE;
break;
}
if (from_sizenames->size > max_threshold)
break;
if (!string_array_nonempty_intersection (from_sizenames->basenames, to_sizenames->basenames))
continue;
/* Only one candidate right now */
g_hash_table_insert (ret_modified_regfile_content,
g_strdup (to_sizenames->checksum),
g_strdup (from_sizenames->checksum));
break;
}
}