deltas: Search for similar objects (possibly renamed across directories)

The previous diff algorithm was file tree based, and only looked
at modified files that lived at the same path.

However, components like the Linux kernel have versioned
subdirectories, e.g. /usr/lib/modules/$kver/.../ext4.ko.  We want to
be able to detect these "modified renames" so that we can compute
diffs (rollsum, bsdiff).
This commit is contained in:
Colin Walters 2015-02-11 03:29:14 -05:00
parent 0f74ed62b7
commit c54df89771
4 changed files with 402 additions and 67 deletions

View File

@ -91,6 +91,7 @@ libostree_1_la_SOURCES = \
src/libostree/ostree-repo-static-delta-core.c \
src/libostree/ostree-repo-static-delta-processing.c \
src/libostree/ostree-repo-static-delta-compilation.c \
src/libostree/ostree-repo-static-delta-compilation-analysis.c \
src/libostree/ostree-repo-static-delta-private.h \
$(NULL)
if USE_LIBARCHIVE

View File

@ -0,0 +1,305 @@
/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
*
* Copyright (C) 2015 Colin Walters <walters@verbum.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include "config.h"
#include <string.h>
#include <gio/gunixoutputstream.h>
#include "ostree-core-private.h"
#include "ostree-repo-private.h"
#include "ostree-lzma-compressor.h"
#include "ostree-repo-static-delta-private.h"
#include "ostree-diff.h"
#include "ostree-rollsum.h"
#include "otutil.h"
#include "ostree-varint.h"
/**
 * _ostree_delta_content_sizenames_free:
 * @v: an #OstreeDeltaContentSizeNames, or %NULL
 *
 * Release an #OstreeDeltaContentSizeNames: its checksum string, its
 * array of basenames, and the structure itself.  Suitable for use as a
 * #GDestroyNotify.
 *
 * A partially constructed entry (one whose @basenames array has not
 * been allocated yet) is accepted, because entries are handed to their
 * owning container before they are fully populated.
 */
void
_ostree_delta_content_sizenames_free (gpointer v)
{
  OstreeDeltaContentSizeNames *ce = v;

  if (ce == NULL)
    return;

  g_free (ce->checksum);
  /* g_ptr_array_unref() does not accept NULL, unlike g_free() */
  if (ce->basenames != NULL)
    g_ptr_array_unref (ce->basenames);
  g_free (ce);
}
/*
 * build_content_sizenames_recurse:
 * @repo: repository used to load file info and directory trees
 * @iter: traversal iterator positioned on a commit or dirtree
 * @sizenames_map: map of content checksum -> OstreeDeltaContentSizeNames,
 *   filled in by this function; keys are borrowed from the values
 * @include_only_objects: (nullable): if non-NULL, files whose checksum
 *   is not a member of this table are ignored
 * @cancellable: optional cancellable
 * @error: return location for an error
 *
 * Walk every entry yielded by @iter.  For each file, make sure an entry
 * for its content checksum exists in @sizenames_map (loading the file
 * info once to learn its size) and append the file's name to the
 * entry's list of basenames.  For each directory, load its dirtree
 * object and recurse into it.
 *
 * Returns: TRUE on success, FALSE with @error set otherwise.
 */
static gboolean
build_content_sizenames_recurse (OstreeRepo                     *repo,
                                 OstreeRepoCommitTraverseIter   *iter,
                                 GHashTable                     *sizenames_map,
                                 GHashTable                     *include_only_objects,
                                 GCancellable                   *cancellable,
                                 GError                        **error)
{
  gboolean ret = FALSE;

  while (TRUE)
    {
      OstreeRepoCommitIterResult iterres =
        ostree_repo_commit_traverse_iter_next (iter, cancellable, error);

      if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
        goto out;
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_END)
        break;
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
        {
          char *name;
          char *checksum;
          OstreeDeltaContentSizeNames *csizenames;

          ostree_repo_commit_traverse_iter_get_file (iter, &name, &checksum);

          if (include_only_objects && !g_hash_table_contains (include_only_objects, checksum))
            continue;

          csizenames = g_hash_table_lookup (sizenames_map, checksum);
          if (!csizenames)
            {
              gs_unref_object GFileInfo *finfo = NULL;

              csizenames = g_new0 (OstreeDeltaContentSizeNames, 1);
              csizenames->checksum = g_strdup (checksum);
              /* Allocate the basenames array before the entry becomes
               * visible to the map, so that the map's destroy notify
               * never sees a half-built entry if loading fails below.
               */
              csizenames->basenames = g_ptr_array_new_with_free_func (g_free);

              /* Hand ownership to the map right away so the entry is
               * cleaned up if we bail out below.
               */
              g_hash_table_replace (sizenames_map, csizenames->checksum, csizenames);

              if (!ostree_repo_load_file (repo, checksum,
                                          NULL, &finfo, NULL,
                                          cancellable, error))
                goto out;

              csizenames->size = g_file_info_get_size (finfo);
            }

          /* The same content may be reachable under several names */
          g_ptr_array_add (csizenames->basenames, g_strdup (name));
        }
      else if (iterres == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
        {
          char *name;
          char *content_checksum;
          char *meta_checksum;
          gs_unref_variant GVariant *dirtree = NULL;
          ostree_cleanup_repo_commit_traverse_iter
            OstreeRepoCommitTraverseIter subiter = { 0, };

          ostree_repo_commit_traverse_iter_get_dir (iter, &name, &content_checksum, &meta_checksum);

          if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
                                         content_checksum, &dirtree,
                                         error))
            goto out;

          if (!ostree_repo_commit_traverse_iter_init_dirtree (&subiter, repo, dirtree,
                                                              OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
                                                              error))
            goto out;

          if (!build_content_sizenames_recurse (repo, &subiter,
                                                sizenames_map, include_only_objects,
                                                cancellable, error))
            goto out;
        }
      else
        g_assert_not_reached ();
    }

  ret = TRUE;
 out:
  return ret;
}
static int
compare_sizenames (const void *a,
const void *b)
{
OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames**)(void*)a;
OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames**)(void*)b;
return sn_a->size - sn_b->size;
}
/**
 * build_content_sizenames_filtered:
 * @repo: repository to read objects from
 * @commit: commit variant whose tree is traversed
 * @include_only_objects: (nullable): optional set of checksums; when
 *   given, only files whose checksum is a member are recorded
 * @out_sizenames: (out): array of OstreeDeltaContentSizeNames, i.e.
 *   [(checksum: str, size: uint64, names: array[string]), ...],
 *   sorted with compare_sizenames()
 * @cancellable: optional cancellable
 * @error: return location for an error
 *
 * Collect one entry per distinct regular file content object reachable
 * from @commit and return them as a sorted array.
 *
 * Returns: TRUE on success, FALSE with @error set otherwise.
 */
static gboolean
build_content_sizenames_filtered (OstreeRepo              *repo,
                                  GVariant                *commit,
                                  GHashTable              *include_only_objects,
                                  GPtrArray              **out_sizenames,
                                  GCancellable            *cancellable,
                                  GError                 **error)
{
  gboolean ret = FALSE;
  gs_unref_ptrarray GPtrArray *sorted_entries =
    g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
  gs_unref_hashtable GHashTable *by_checksum =
    g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
  ostree_cleanup_repo_commit_traverse_iter
    OstreeRepoCommitTraverseIter root_iter = { 0, };
  GHashTableIter cursor;
  gpointer entry_key;
  gpointer entry;

  if (!ostree_repo_commit_traverse_iter_init_commit (&root_iter, repo, commit,
                                                     OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
                                                     error))
    goto out;

  if (!build_content_sizenames_recurse (repo, &root_iter, by_checksum, include_only_objects,
                                        cancellable, error))
    goto out;

  /* Move every entry out of the map and into the array; stealing
   * keeps the map's destroy notify from freeing what the array now owns.
   */
  for (g_hash_table_iter_init (&cursor, by_checksum);
       g_hash_table_iter_next (&cursor, &entry_key, &entry);
       )
    {
      g_hash_table_iter_steal (&cursor);
      g_ptr_array_add (sorted_entries, entry);
    }

  g_ptr_array_sort (sorted_entries, compare_sizenames);

  gs_transfer_out_value (out_sizenames, &sorted_entries);
  ret = TRUE;
 out:
  return ret;
}
/*
 * string_array_nonempty_intersection:
 * @a: array of NUL-terminated strings
 * @b: array of NUL-terminated strings
 *
 * Returns: TRUE if at least one string is present in both @a and @b
 * (compared with strcmp()), FALSE otherwise.
 */
static gboolean
string_array_nonempty_intersection (GPtrArray    *a,
                                    GPtrArray    *b)
{
  guint ai = 0;

  while (ai < a->len)
    {
      const char *candidate = g_ptr_array_index (a, ai);
      guint bi;

      for (bi = 0; bi < b->len; bi++)
        {
          const char *other = g_ptr_array_index (b, bi);

          if (strcmp (candidate, other) == 0)
            return TRUE;
        }

      ai++;
    }

  return FALSE;
}
/*
 * _ostree_delta_compute_similar_objects:
 * @repo: repository to read objects from
 * @from_commit: commit variant of the delta source
 * @to_commit: commit variant of the delta target
 * @new_reachable_regfile_content: Set<checksum> of new regular file
 *   objects; only these are considered on the "to" side
 * @similarity_percent_threshold: a "from" object is a candidate only if
 *   its size lies within this percentage of the "to" object's size
 * @out_modified_regfile_content: (out): Map<to checksum,from checksum>;
 *   both keys and values are newly allocated strings owned by the table
 * @cancellable: optional cancellable
 * @error: return location for an error
 *
 * Build up a map of files with matching basenames and similar size,
 * and use it to find apparently similar objects.
 *
 * Currently at most one "from" candidate is recorded per "to" object;
 * however in the future it would be easy to have this function return
 * multiple candidate matches.  The hard part would be changing
 * the delta compiler to iterate over all matches, determine
 * a cost for each one, then pick the best.
 *
 * Returns: TRUE on success, FALSE with @error set otherwise.
 */
gboolean
_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
                                       GVariant                   *from_commit,
                                       GVariant                   *to_commit,
                                       GHashTable                 *new_reachable_regfile_content,
                                       guint                       similarity_percent_threshold,
                                       GHashTable                **out_modified_regfile_content,
                                       GCancellable               *cancellable,
                                       GError                    **error)
{
  gboolean ret = FALSE;
  /* Keys and values are both g_strdup()ed checksums, so both must be
   * released with g_free().
   */
  gs_unref_hashtable GHashTable *ret_modified_regfile_content =
    g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
  gs_unref_ptrarray GPtrArray *from_sizes = NULL;
  gs_unref_ptrarray GPtrArray *to_sizes = NULL;
  const double similarity_fraction = similarity_percent_threshold / 100.0;
  guint i, j;
  guint lower;
  guint upper;

  if (!build_content_sizenames_filtered (repo, from_commit, NULL,
                                         &from_sizes,
                                         cancellable, error))
    goto out;

  if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
                                         &to_sizes,
                                         cancellable, error))
    goto out;

  /* Iterate over all newly added objects, find objects which have
   * similar basename and sizes.
   *
   * Because the arrays are sorted by size, we can maintain a `lower`
   * bound on the original (from) objects to start searching.
   */
  lower = 0;
  upper = from_sizes->len;
  for (i = 0; i < to_sizes->len; i++)
    {
      OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
      /* A threshold of 100% or more means "no lower bound"; clamp to 0
       * instead of converting a negative double to an unsigned integer.
       */
      const guint64 min_threshold = (similarity_fraction >= 1.0)
        ? 0
        : (guint64) (to_sizenames->size * (1.0 - similarity_fraction));
      const guint64 max_threshold = to_sizenames->size *
        (1.0 + similarity_fraction);

      /* Don't build candidates for the empty object */
      if (to_sizenames->size == 0)
        continue;

      for (j = lower; j < upper; j++)
        {
          OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];

          /* Don't build candidates for the empty object */
          if (from_sizenames->size == 0)
            continue;

          if (from_sizenames->size < min_threshold)
            {
              /* Too small for this and, since "to" sizes only grow,
               * for every later "to" object as well.
               */
              lower++;
              continue;
            }

          /* Everything from here on is too large */
          if (from_sizenames->size > max_threshold)
            break;

          if (!string_array_nonempty_intersection (from_sizenames->basenames, to_sizenames->basenames))
            continue;

          /* Only one candidate right now */
          g_hash_table_insert (ret_modified_regfile_content,
                               g_strdup (to_sizenames->checksum),
                               g_strdup (from_sizenames->checksum));
          break;
        }
    }

  ret = TRUE;
  gs_transfer_out_value (out_modified_regfile_content, &ret_modified_regfile_content);
 out:
  return ret;
}

View File

@ -32,6 +32,8 @@
#include "otutil.h"
#include "ostree-varint.h"
/* Size tolerance, in percent, passed as similarity_percent_threshold to
 * _ostree_delta_compute_similar_objects(): a "from" object is only
 * considered similar to a "to" object when its size is within this
 * percentage of the "to" object's size.
 */
#define CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT (30)
typedef struct {
guint64 uncompressed_size;
GPtrArray *objects;
@ -479,7 +481,7 @@ try_content_rollsum (OstreeRepo *repo,
gs_unref_bytes GBytes *tmp_to = NULL;
gs_unref_object GFileInfo *from_finfo = NULL;
gs_unref_object GFileInfo *to_finfo = NULL;
OstreeRollsumMatches *matches;
OstreeRollsumMatches *matches = NULL;
ContentRollsum *ret_rollsum = NULL;
*out_rollsum = NULL;
@ -669,7 +671,6 @@ process_one_rollsum (OstreeRepo *repo,
return ret;
}
static gboolean
generate_delta_lowlatency (OstreeRepo *repo,
const char *from,
@ -681,18 +682,18 @@ generate_delta_lowlatency (OstreeRepo *repo,
gboolean ret = FALSE;
GHashTableIter hashiter;
gpointer key, value;
guint i;
OstreeStaticDeltaPartBuilder *current_part = NULL;
gs_unref_object GFile *root_from = NULL;
gs_unref_variant GVariant *from_commit = NULL;
gs_unref_object GFile *root_to = NULL;
gs_unref_ptrarray GPtrArray *modified = NULL;
gs_unref_ptrarray GPtrArray *removed = NULL;
gs_unref_ptrarray GPtrArray *added = NULL;
gs_unref_variant GVariant *to_commit = NULL;
gs_unref_hashtable GHashTable *to_reachable_objects = NULL;
gs_unref_hashtable GHashTable *from_reachable_objects = NULL;
gs_unref_hashtable GHashTable *from_regfile_content = NULL;
gs_unref_hashtable GHashTable *new_reachable_metadata = NULL;
gs_unref_hashtable GHashTable *new_reachable_content = NULL;
gs_unref_hashtable GHashTable *modified_content_objects = NULL;
gs_unref_hashtable GHashTable *new_reachable_regfile_content = NULL;
gs_unref_hashtable GHashTable *new_reachable_symlink_content = NULL;
gs_unref_hashtable GHashTable *modified_regfile_content = NULL;
gs_unref_hashtable GHashTable *rollsum_optimized_content_objects = NULL;
gs_unref_hashtable GHashTable *content_object_to_size = NULL;
@ -701,51 +702,30 @@ generate_delta_lowlatency (OstreeRepo *repo,
if (!ostree_repo_read_commit (repo, from, &root_from, NULL,
cancellable, error))
goto out;
}
if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
cancellable, error))
goto out;
/* Gather a filesystem level diff; when we do heuristics to ship
* just parts of changed files, we can make use of this data.
*/
modified = g_ptr_array_new_with_free_func ((GDestroyNotify) ostree_diff_item_unref);
removed = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
added = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
if (!ostree_diff_dirs (OSTREE_DIFF_FLAGS_NONE, root_from, root_to, modified, removed, added,
cancellable, error))
goto out;
if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, from,
&from_commit, error))
goto out;
modified_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
g_free, g_free);
for (i = 0; i < modified->len; i++)
{
OstreeDiffItem *diffitem = modified->pdata[i];
/* Theoretically, a target file could replace multiple source
* files. That could happen if say a project changed from having
* multiple binaries to one binary.
*
* In that case, we have last one wins behavior. For ELF rollsum
* tends to be useless unless there's a large static data blob.
*/
g_hash_table_replace (modified_content_objects,
g_strdup (diffitem->target_checksum),
g_strdup (diffitem->src_checksum));
}
if (from)
{
if (!ostree_repo_traverse_commit (repo, from, 0, &from_reachable_objects,
cancellable, error))
goto out;
}
if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
cancellable, error))
goto out;
if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, to,
&to_commit, error))
goto out;
if (!ostree_repo_traverse_commit (repo, to, 0, &to_reachable_objects,
cancellable, error))
goto out;
new_reachable_metadata = ostree_repo_traverse_new_reachable ();
new_reachable_content = ostree_repo_traverse_new_reachable ();
new_reachable_regfile_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
new_reachable_symlink_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
g_hash_table_iter_init (&hashiter, to_reachable_objects);
while (g_hash_table_iter_next (&hashiter, &key, &value))
@ -763,14 +743,41 @@ generate_delta_lowlatency (OstreeRepo *repo,
if (OSTREE_OBJECT_TYPE_IS_META (objtype))
g_hash_table_add (new_reachable_metadata, serialized_key);
else
g_hash_table_add (new_reachable_content, serialized_key);
{
gs_unref_object GFileInfo *finfo = NULL;
GFileType ftype;
if (!ostree_repo_load_file (repo, checksum, NULL, &finfo, NULL,
cancellable, error))
goto out;
ftype = g_file_info_get_file_type (finfo);
if (ftype == G_FILE_TYPE_REGULAR)
g_hash_table_add (new_reachable_regfile_content, g_strdup (checksum));
else if (ftype == G_FILE_TYPE_SYMBOLIC_LINK)
g_hash_table_add (new_reachable_symlink_content, g_strdup (checksum));
else
g_assert_not_reached ();
}
}
g_printerr ("modified: %u removed: %u added: %u\n",
modified->len, removed->len, added->len);
g_printerr ("new reachable: metadata=%u content=%u\n",
if (from_commit)
{
if (!_ostree_delta_compute_similar_objects (repo, from_commit, to_commit,
new_reachable_regfile_content,
CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT,
&modified_regfile_content,
cancellable, error))
goto out;
}
else
modified_regfile_content = g_hash_table_new (g_str_hash, g_str_equal);
g_printerr ("modified: %u\n", g_hash_table_size (modified_regfile_content));
g_printerr ("new reachable: metadata=%u content regular=%u symlink=%u\n",
g_hash_table_size (new_reachable_metadata),
g_hash_table_size (new_reachable_content));
g_hash_table_size (new_reachable_regfile_content),
g_hash_table_size (new_reachable_symlink_content));
/* We already ship the to commit in the superblock, don't ship it twice */
g_hash_table_remove (new_reachable_metadata,
@ -780,7 +787,7 @@ generate_delta_lowlatency (OstreeRepo *repo,
g_free,
(GDestroyNotify) content_rollsums_free);
g_hash_table_iter_init (&hashiter, modified_content_objects);
g_hash_table_iter_init (&hashiter, modified_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
const char *to_checksum = key;
@ -800,7 +807,7 @@ generate_delta_lowlatency (OstreeRepo *repo,
g_printerr ("rollsum for %u/%u modified\n",
g_hash_table_size (rollsum_optimized_content_objects),
g_hash_table_size (modified_content_objects));
g_hash_table_size (modified_regfile_content));
current_part = allocate_part (builder);
@ -837,22 +844,18 @@ generate_delta_lowlatency (OstreeRepo *repo,
/* Scan for large objects, so we can fall back to plain HTTP-based
* fetch.
*/
g_hash_table_iter_init (&hashiter, new_reachable_content);
g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
GVariant *serialized_key = key;
const char *checksum;
OstreeObjectType objtype;
const char *checksum = key;
guint64 uncompressed_size;
gboolean fallback = FALSE;
ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
if (!ostree_repo_load_object_stream (repo, objtype, checksum,
if (!ostree_repo_load_object_stream (repo, OSTREE_OBJECT_TYPE_FILE, checksum,
NULL, &uncompressed_size,
cancellable, error))
goto out;
@ -862,30 +865,37 @@ generate_delta_lowlatency (OstreeRepo *repo,
if (fallback)
{
gs_free char *size = g_format_size (uncompressed_size);
g_printerr ("fallback for %s (%s)\n",
ostree_object_to_string (checksum, objtype), size);
g_printerr ("fallback for %s (%s)\n", checksum, size);
g_ptr_array_add (builder->fallback_objects,
g_variant_ref (serialized_key));
ostree_object_name_serialize (checksum, OSTREE_OBJECT_TYPE_FILE));
g_hash_table_iter_remove (&hashiter);
}
}
/* Now non-rollsummed content */
g_hash_table_iter_init (&hashiter, new_reachable_content);
/* Now non-rollsummed regular file content */
g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
GVariant *serialized_key = key;
const char *checksum;
OstreeObjectType objtype;
ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
const char *checksum = key;
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
if (!process_one_object (repo, builder, &current_part,
checksum, objtype,
checksum, OSTREE_OBJECT_TYPE_FILE,
cancellable, error))
goto out;
}
/* Now symlinks */
g_hash_table_iter_init (&hashiter, new_reachable_symlink_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
const char *checksum = key;
if (!process_one_object (repo, builder, &current_part,
checksum, OSTREE_OBJECT_TYPE_FILE,
cancellable, error))
goto out;
}

View File

@ -152,4 +152,23 @@ _ostree_repo_static_delta_part_have_all_objects (OstreeRepo *repo,
gboolean *out_have_all,
GCancellable *cancellable,
GError **error);
/* One regular-file content object together with its size and every
 * name under which it was found while traversing a commit.
 * Freed with _ostree_delta_content_sizenames_free().
 */
typedef struct {
/* Content checksum of the object; owned copy */
char *checksum;
/* Size as reported by the object's GFileInfo */
guint64 size;
/* Array of owned (char *) file names referring to this content */
GPtrArray *basenames;
} OstreeDeltaContentSizeNames;
/* Free an OstreeDeltaContentSizeNames: its checksum, its basenames
 * array and the structure itself.  Takes a gpointer so it can be used
 * as a GDestroyNotify.
 */
void _ostree_delta_content_sizenames_free (gpointer v);
/* Find regular file objects in @to_commit that look like modified
 * versions of objects in @from_commit: they share at least one basename
 * and their sizes differ by no more than @similarity_percent_threshold
 * percent.
 *
 * @new_reachable_regfile_content is a set of checksums restricting which
 * objects of @to_commit are considered.  On success,
 * @out_modified_regfile_content receives a map of "to" checksum to
 * "from" checksum.
 *
 * Returns TRUE on success, FALSE with @error set otherwise.
 */
gboolean
_ostree_delta_compute_similar_objects (OstreeRepo *repo,
GVariant *from_commit,
GVariant *to_commit,
GHashTable *new_reachable_regfile_content,
guint similarity_percent_threshold,
GHashTable **out_modified_regfile_content,
GCancellable *cancellable,
GError **error);
G_END_DECLS